# Imports

In [24]:
import pandas as pd
import os
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Function to calculate the number of files in a directory

In [38]:
# Count the regular files that sit directly inside a directory
def count_files_in_directory(directory):
    """Return how many entries directly inside *directory* are files.

    Subdirectories (and anything else that ``os.path.isfile`` rejects) are
    not counted, and the directory is not searched recursively.
    """
    total = 0
    for entry_name in os.listdir(directory):
        # os.listdir() yields bare names, so rebuild the full path before testing it
        entry_path = os.path.join(directory, entry_name)
        if os.path.isfile(entry_path):
            total += 1
    return total

In [26]:
# Accumulators filled in parallel: index i of every list describes the same image.

# Preprocessed pixel arrays
image_pixel: list = []
# Paths the images were loaded from
image_paths: list = []
# Binary labels (0, 1)
image_label: list = []
# Sub-labels (good, manipulated_front, scratch_head, thread_side, thread_top)
image_subLabel: list = []

## Image Categorization Function

This function processes the images in a given directory, preprocesses each one, and appends its pixel data, label, sub-label, and path to the predefined lists. It expects the images to be PNG files named with zero-padded, three-digit sequence numbers (e.g. `001.png`, `002.png`).

### Steps Performed:
1. **Image Loading**
2. **Image Resizing**: Each image is resized to a fixed size of 256x256 pixels.
3. **Pixel Array Conversion**: The image is converted to a NumPy array.
4. **Normalization**: The pixel values are normalized by dividing by 255, so they range between 0 and 1.
5. **Channel Expansion**: A trailing axis of size 1 is appended to the array, so a grayscale image becomes 256x256x1. Note that an RGB image, which already has a channel axis, becomes 256x256x3x1.
6. **Data Storage**: 
   - The processed pixel data is appended to the `image_pixel` list.
   - The label and sub-label are appended to the `image_label` and `image_subLabel` lists, respectively.
   - The image path is stored in the `image_paths` list.

In [27]:
# Function to categorize images by resizing and storing pixel data, labels, and paths
def categorize_images(count, directory, label, sub_label):
    """Load sequentially numbered PNG images from *directory* into the global lists.

    Images are expected to be named with zero-padded three-digit numbers
    (``000.png`` or ``001.png``, then ``002.png``, ...). Every image that loads
    successfully is resized to 256x256, divided by 255.0, given a trailing
    axis of size 1, and appended to ``image_pixel``; *label*, *sub_label* and
    the image path are appended to ``image_label``, ``image_subLabel`` and
    ``image_paths`` at the same position.

    Args:
        count: Number of sequentially numbered files to attempt to load.
        directory: Directory that contains the images.
        label: Label recorded for every image loaded from this directory.
        sub_label: Sub-label recorded for every image loaded from this directory.

    Returns:
        None. Results are appended to the module-level lists.

    Files that are missing or cannot be processed are reported with ``print``
    and skipped; no exception propagates to the caller.
    """
    # Fixed size every image is resized to
    input_size = (256, 256)

    # The sequence may start at 000 or at 001. Probe for 000.png so that exactly
    # `count` consecutive numbers are visited in either case (the previous
    # range(1, count) always visited one index too few).
    first_index = 0 if os.path.isfile(f"{directory}/000.png") else 1

    for j in range(first_index, first_index + count):
        # Three-digit, zero-padded file name: <directory>/<NNN>.png
        image_path = f"{directory}/{j:03d}.png"

        try:
            # The context manager closes the underlying file handle as soon as
            # the pixel data has been copied into the NumPy array.
            with Image.open(image_path) as image:
                image_array = np.array(image.resize(input_size))

            # Scale by 1/255 (maps 8-bit pixel values into [0, 1])
            image_array = image_array / 255.0
            # Append a trailing axis of size 1.
            # NOTE(review): for multi-channel (e.g. RGB) input this yields
            # (256, 256, C, 1) -- confirm that is what the consumer expects.
            image_array = np.expand_dims(image_array, axis=-1)
        except FileNotFoundError:
            # Print a message if the image file is not found
            print(f"File {image_path} not found.")
            continue
        except Exception as e:
            # Best effort: report any other failure and move on to the next file
            print(f"Error processing file {image_path}: {e}")
            continue

        # Record the image only after it was processed successfully, so the
        # four lists always stay aligned.
        image_pixel.append(image_array)
        image_label.append(label)
        image_subLabel.append(sub_label)
        image_paths.append(image_path)


## Categorizing Images from Multiple Directories

In [28]:
# Load every class folder under Data/; the folder name doubles as the sub-label.
# "good" images get label 0, every other folder gets label 1.
for folder_name, binary_label in (
    ("good", 0),
    ("manipulated_front", 1),
    ("scratch_head", 1),
    ("thread_side", 1),
    ("thread_top", 1),
):
    folder_path = "Data/" + folder_name
    categorize_images(count_files_in_directory(folder_path), folder_path, binary_label, folder_name)


## Saving Categorized Image Data as a `.npz` File
**Function `np.savez`**:
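   - The function `np.savez` is used to store the image data, labels, and sub-labels in a single `.npz` archive. Note that `np.savez` does not compress the data; `np.savez_compressed` would be needed for a compressed archive.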
   - The `image_pixel` array contains the preprocessed image data (with pixel values).
   - The `image_label` array contains the labels for each image (`0` for good, `1` for any defect class: manipulated_front, scratch_head, thread_side, thread_top).
   - The `image_subLabel` array contains the sub-labels for each image (e.g., "good", "manipulated_front").

In [34]:
# Write the collected pixel data, labels and sub-labels to data.npz;
# each dictionary key becomes the name of one array inside the archive.
arrays_to_store = {
    "image": image_pixel,
    "label": image_label,
    "sub_label": image_subLabel,
}
np.savez('data.npz', **arrays_to_store)