In [None]:
# Mount Google Drive at /content/drive so the dataset folders used below are reachable
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Loading the dataset from Google Drive
The images are stored in separate folders, one per category. We consider the following 8 categories:

1)Plastic
2)Cardboard
3)Metal
4)Glass
5)Medical Waste
6)Organic waste
7)Wood
8)Paper

In [None]:
import os

# Root folder of the dataset on the mounted Drive (note: the folder is named 'Final DataSet')
folder_path = '/content/drive/My Drive/Final DataSet'

# Category folder names whose images should be counted
categories = ['plastic', 'cardboard', 'metal', 'glass', 'medical',
              'Organic Waste', 'Wood', 'paper']

# Function to count the number of images in each category folder
def count_files_in_folders(folder_path, wanted=None):
    """Print the number of image files in each category folder under ``folder_path``.

    The tree is walked recursively with ``os.walk``. Every directory whose base
    name is one of the wanted categories is reported, provided it directly
    contains at least one ``.png``/``.jpg``/``.jpeg`` file (case-insensitive).
    Because the walk is recursive, a category name that appears at several
    depths is reported once per matching directory.

    Args:
        folder_path: Root directory to walk.
        wanted: Optional iterable of folder names to report. When omitted, the
            notebook-level ``categories`` variable is used (the original
            behaviour). Passing it explicitly keeps the result independent of
            which cell last rebound ``categories``.
    """
    # set() accepts a list as well as a dict (its keys), and gives O(1) lookups
    names = set(categories if wanted is None else wanted)
    image_exts = ('.png', '.jpg', '.jpeg')

    for root, _dirs, files in os.walk(folder_path):
        folder_name = os.path.basename(root)
        if folder_name not in names:
            continue

        # Count only image files with the specified extensions
        num_images = sum(1 for f in files if f.lower().endswith(image_exts))

        # Folders without any image are not reported
        if num_images > 0:
            print(f"Category: {folder_name} | Total images: {num_images}")

# Report the image count of every category folder found under folder_path
count_files_in_folders(folder_path)


Category: metal | Total images: 2259
Category: glass | Total images: 2518
Category: plastic | Total images: 2617
Category: paper | Total images: 2749
Category: medical | Total images: 1605
Category: Organic Waste | Total images: 277
Category: cardboard | Total images: 2332
Category: Wood | Total images: 347


The highest count of images among our categories is 2749 (Paper). We are setting this as the target count for balancing. This approach would require augmenting images in the under-represented categories.

The under-represented categories selected for augmentation are:
Wood - 347
medical - 1605
cardboard - 2332
Organic Waste - 277

Use augmentation techniques (rotation, flipping, zooming, etc.) to generate new samples

In [None]:
import os
import random
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, save_img

# Target number of images per category; 2749 is the 'paper' count reported above (the largest class)
target_num_images = 2749

# Random transformations used to synthesize extra images when balancing
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    zoom_range=0.15,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Image count per category before augmentation, as reported by the dataset scan above
categories = {
    'plastic': 2617,
    'cardboard': 2332,
    'metal': 2259,  # was 2478, but every scan of the dataset reports 2259
    'glass': 2518,
    'medical': 1605,
    'Organic Waste': 277,
    'Wood': 347,
    'paper': 2749
}

# Categories to top up with augmented images (Wood, cardboard, medical and Organic Waste)
# NOTE(review): metal (2259) is smaller than cardboard (2332) but is not listed - confirm this is intended
augmented_categories = ['Wood','cardboard','medical', 'Organic Waste']

# Function to balance the dataset by augmenting images
def balance_dataset(dataset_path):
    """Top up each selected category to ``target_num_images`` with augmented copies.

    Reads the notebook-level ``categories``, ``augmented_categories``,
    ``target_num_images`` and ``datagen``. For every category listed in
    ``augmented_categories``, randomly transformed versions of randomly chosen
    source images are written to ``<dataset_path>/<category>/augmented`` until
    originals plus augmented files reach the target.

    Args:
        dataset_path: Root folder that contains one subfolder per category.
    """
    image_exts = ('.png', '.jpg', '.jpeg')

    for category, count in categories.items():
        if category not in augmented_categories:  # Only augment specified categories
            continue

        print(f"Category: {category} | Total images: {count}")
        category_path = os.path.join(dataset_path, category)
        images = [img for img in os.listdir(category_path) if img.lower().endswith(image_exts)]

        # Create a new directory for augmented images
        augmented_dir = os.path.join(category_path, 'augmented')
        os.makedirs(augmented_dir, exist_ok=True)

        # Count augmented images left by a previous run so that re-running this
        # cell tops the category up instead of generating a full batch again.
        taken_names = set(os.listdir(augmented_dir))
        already_augmented = sum(1 for name in taken_names if name.lower().endswith(image_exts))
        num_images = len(images) + already_augmented
        print(f"Balancing {category}: {num_images} -> {target_num_images}")

        if not images:
            # random.choice would raise IndexError on an empty list
            print(f"Skipping {category}: no source images found in {category_path}")
            continue

        next_index = 0
        while num_images < target_num_images:
            # Pick a fresh source image for every augmented sample; looping over
            # datagen.flow() never ends on its own, so it would keep producing
            # variants of one single source image until the target is reached.
            img_path = os.path.join(category_path, random.choice(images))
            x = img_to_array(load_img(img_path))
            augmented = datagen.random_transform(x)

            # Use the first unused sequential name: random names collide and
            # silently overwrite earlier files while the counter keeps growing.
            new_img_name = f"aug_{next_index}.jpg"
            while new_img_name in taken_names:
                next_index += 1
                new_img_name = f"aug_{next_index}.jpg"
            taken_names.add(new_img_name)

            save_img(os.path.join(augmented_dir, new_img_name), augmented)  # Save in the new folder
            num_images += 1


# Root folder of the dataset on the mounted Drive (edit if your dataset lives elsewhere)
dataset_path = '/content/drive/My Drive/Final DataSet'

# Augment the selected categories; new images are written to <category>/augmented
balance_dataset(dataset_path)


Category: cardboard | Total images: 2332
Balancing cardboard: 2332 -> 2749
Category: medical | Total images: 1605
Balancing medical: 1605 -> 2749
Category: Organic Waste | Total images: 277
Balancing Organic Waste: 277 -> 2749
Category: Wood | Total images: 347
Balancing Wood: 347 -> 2749


In [None]:
import os

# Path to your dataset
dataset_path = '/content/drive/My Drive/Final DataSet'

# Number of regular files directly inside each top-level folder of the dataset
folder_file_counts = {}

# os.scandir yields entries that carry their own type information, which usually
# avoids a separate stat call per file. The loop variables are deliberately not
# called `folder_path`, so the dataset root stored under that name by the
# counting cells is not overwritten.
with os.scandir(dataset_path) as top_level_entries:
    for top_entry in top_level_entries:
        # Only folders are reported; loose files in the dataset root are ignored
        if not top_entry.is_dir():
            continue
        with os.scandir(top_entry.path) as child_entries:
            # Count files in the folder (ignoring subfolders)
            folder_file_counts[top_entry.name] = sum(1 for child in child_entries if child.is_file())

# Print the count of files in each folder
for folder, count in folder_file_counts.items():
    print(f"Folder '{folder}': {count} files")


Folder 'duplicate_train': 19138 files
Folder 'Textiles': 335 files
Folder 'metal': 2259 files
Folder 'glass': 2518 files
Folder 'plastic': 2617 files
Folder 'paper': 2749 files
Folder 'e-waste': 2405 files
Folder 'medical': 1605 files
Folder 'Organic Waste': 277 files
Folder 'cardboard': 2332 files
Folder 'Wood': 347 files
Folder 'Aluminium': 763 files
Folder 'MobileNet Models': 6 files
Folder 'ResNet Models': 16 files
Folder 'Balanced': 2 files
Folder 'resized_train_images_by_anthoni': 6 files
Folder 'resized_train_final_Anthoni_Sagarika': 17116 files


In [None]:
import os

# Dataset root and the categories whose 'augmented' subfolders are inspected
dataset_path = '/content/drive/My Drive/Final DataSet'
augmented_categories = ['Wood', 'cardboard', 'medical', 'Organic Waste']

# Maps each category to the number of files found in its 'augmented' subfolder
augmented_counts = {}

for category in augmented_categories:
    augmented_path = os.path.join(dataset_path, category, 'augmented')
    # A missing 'augmented' folder is treated as an empty one
    entries = os.listdir(augmented_path) if os.path.exists(augmented_path) else []
    # Only regular files are counted (they are assumed to be images)
    augmented_counts[category] = sum(
        1 for entry in entries if os.path.isfile(os.path.join(augmented_path, entry))
    )

# Report the number of augmented files per category
for category, count in augmented_counts.items():
    print(f"Category '{category}' has {count} images in the 'augmented' folder")


Category 'Wood' has 2111 images in the 'augmented' folder
Category 'cardboard' has 410 images in the 'augmented' folder
Category 'medical' has 1088 images in the 'augmented' folder
Category 'Organic Waste' has 2209 images in the 'augmented' folder


In [None]:
import os

# Path to your dataset
dataset_path = '/content/drive/My Drive/Final DataSet'

# Categories with initial counts
categories = {
    'plastic': 2617,
    'cardboard': 2332,
    'metal': 2478,
    'glass': 2518,
    'medical': 1605,
    'Organic Waste': 277,
    'Wood': 347,
    'paper': 2749
}

# Dictionary to store total counts for each category (including augmented images)
total_counts = {}

for category in categories:
    category_path = os.path.join(dataset_path, category)
    augmented_path = os.path.join(category_path, 'augmented')

    # Count original images in the main category folder
    original_count = len([f for f in os.listdir(category_path) if os.path.isfile(os.path.join(category_path, f))])

    # Count images in the 'augmented' subfolder if it exists
    augmented_count = 0
    if os.path.exists(augmented_path):
        augmented_count = len([f for f in os.listdir(augmented_path) if os.path.isfile(os.path.join(augmented_path, f))])

    # Total count is the sum of original and augmented counts
    total_counts[category] = original_count + augmented_count

# Print the total counts for each category
for category, count in total_counts.items():
    print(f"Category '{category}' has a total of {count} images (including augmented images)")


Category 'plastic' has a total of 2617 images (including augmented images)
Category 'cardboard' has a total of 2742 images (including augmented images)
Category 'metal' has a total of 2259 images (including augmented images)
Category 'glass' has a total of 2518 images (including augmented images)
Category 'medical' has a total of 2693 images (including augmented images)
Category 'Organic Waste' has a total of 2486 images (including augmented images)
Category 'Wood' has a total of 2458 images (including augmented images)
Category 'paper' has a total of 2749 images (including augmented images)


In [None]:
# Print final counts after all processing
def print_final_counts(category_counts=None, augmented=None, target=None):
    """Print one summary line per category.

    Categories listed in ``augmented`` are reported with the balancing target,
    which is the intended total and not a measured file count. All other
    categories are reported with their recorded count.

    Args:
        category_counts: Mapping of category name to image count. Defaults to
            the notebook-level ``categories``.
        augmented: Collection of category names that were augmented. Defaults
            to the notebook-level ``augmented_categories``.
        target: Image count that augmented categories were balanced to.
            Defaults to the notebook-level ``target_num_images``.

    Raises:
        TypeError: If the counts are not a mapping. Several cells rebind
            ``categories`` to a plain list, which previously surfaced here as
            "'list' object has no attribute 'keys'".
    """
    counts = categories if category_counts is None else category_counts
    boosted = augmented_categories if augmented is None else augmented
    goal = target_num_images if target is None else target

    if not hasattr(counts, 'keys'):
        raise TypeError(
            "print_final_counts needs a {category: count} mapping, got "
            f"{type(counts).__name__}; `categories` was probably rebound by another "
            "cell - re-run the cell that defines the counts dict or pass one explicitly"
        )

    for category in counts.keys():
        if category in boosted:
            print(f"Final total images for {category}: {goal}")
        else:
            print(f"Total images for {category}: {counts[category]}")

# Relies on the notebook-level `categories` dict, `augmented_categories` and `target_num_images`
print_final_counts()


AttributeError: 'list' object has no attribute 'keys'

Performing Feature extraction from the above mentioned categories using Inception V3

In [None]:
import os

# Root folder of the dataset on the mounted Drive (note: the folder is named 'Final DataSet')
folder_path = '/content/drive/My Drive/Final DataSet'

# Additional category folder names whose images should be counted
categories = ['e-waste','Aluminium','Textiles']

# Function to count the number of images in each category folder
# (redefinition of the function from the first counting cell, kept in sync with it)
def count_files_in_folders(folder_path, wanted=None):
    """Print the number of image files in each category folder under ``folder_path``.

    The tree is walked recursively with ``os.walk``. Every directory whose base
    name is one of the wanted categories is reported, provided it directly
    contains at least one ``.png``/``.jpg``/``.jpeg`` file (case-insensitive).

    Args:
        folder_path: Root directory to walk.
        wanted: Optional iterable of folder names to report. When omitted, the
            notebook-level ``categories`` variable is used (the original
            behaviour). Passing it explicitly keeps the result independent of
            which cell last rebound ``categories``.
    """
    # set() accepts a list as well as a dict (its keys), and gives O(1) lookups
    names = set(categories if wanted is None else wanted)
    image_exts = ('.png', '.jpg', '.jpeg')

    for root, _dirs, files in os.walk(folder_path):
        folder_name = os.path.basename(root)
        if folder_name not in names:
            continue

        # Count only image files with the specified extensions
        num_images = sum(1 for f in files if f.lower().endswith(image_exts))

        # Folders without any image are not reported
        if num_images > 0:
            print(f"Category: {folder_name} | Total images: {num_images}")

# Report the image count of every category folder found under folder_path
count_files_in_folders(folder_path)


Category: Textiles | Total images: 335
Category: e-waste | Total images: 2405
Category: Aluminium | Total images: 763


We also performed augmentation for the Textiles category and removed the Aluminium category.

Train + Test Split

In [None]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import load_img, img_to_array, save_img

# Root folder of the dataset on the mounted Drive
dataset_path = '/content/drive/My Drive/Final DataSet'

# Destination folders for the two splits (located inside the dataset folder itself)
train_dir = os.path.join(dataset_path, 'train')
test_dir = os.path.join(dataset_path, 'test')

# Make sure both split roots exist; exist_ok=True keeps re-runs from failing
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Category folders to split
categories = ['plastic', 'cardboard', 'metal', 'glass', 'medical', 'Organic Waste', 'Wood', 'paper']

def _copy_missing(names, src_dir, dst_dir):
    """Copy the named files from ``src_dir`` to ``dst_dir``, skipping finished copies.

    A destination file that already exists with the same size as its source is
    left alone, so an interrupted split can be resumed without copying
    everything again.
    """
    for img_name in names:
        src = os.path.join(src_dir, img_name)
        dst = os.path.join(dst_dir, img_name)
        if os.path.exists(dst) and os.path.getsize(dst) == os.path.getsize(src):
            continue
        shutil.copy(src, dst)


# Split the dataset into train and test
def split_data():
    """Copy each category's images into ``train``/``test`` subfolders (80/20 split).

    Reads the notebook-level ``categories``, ``dataset_path``, ``train_dir`` and
    ``test_dir``. Only image files directly inside a category folder are
    considered; files in subfolders such as ``augmented`` are not picked up.
    The originals are copied, not moved.

    Files left in ``train``/``test`` by an earlier run that used a different
    split are not removed.
    """
    image_exts = ('.png', '.jpg', '.jpeg')

    for category in categories:
        category_path = os.path.join(dataset_path, category)

        # Create subdirectories for each category in train and test
        train_category_path = os.path.join(train_dir, category)
        test_category_path = os.path.join(test_dir, category)
        os.makedirs(train_category_path, exist_ok=True)
        os.makedirs(test_category_path, exist_ok=True)

        # os.listdir returns names in arbitrary order; sorting makes the split
        # produced by random_state=42 identical on every run, so a re-run can
        # never place the same image in both train and test.
        images = sorted(img for img in os.listdir(category_path) if img.lower().endswith(image_exts))

        # train_test_split cannot produce two non-empty parts from fewer than 2 samples
        if len(images) < 2:
            print(f"Skipping {category}: need at least 2 images to split, found {len(images)}")
            continue

        # Split the images into train and test
        train_images, test_images = train_test_split(images, test_size=0.2, random_state=42)

        # Copy the images to their respective folders
        _copy_missing(train_images, category_path, train_category_path)
        _copy_missing(test_images, category_path, test_category_path)

# Copy every category's images into the train/ and test/ folders
split_data()

print("Train-test split is complete.")


KeyboardInterrupt: 