Butterfly Image Classification

Step 1: Import Required Libraries

In [1]:
# Libraries
import os
import numpy as np
import shutil
import h5py
from PIL import Image
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical




Step 2: Data Exploration, Preprocessing and Preparation

In [2]:
# Read the training manifest into a DataFrame.
train_df = pd.read_csv('Training_set.csv')

# Preview the leading rows to confirm the column layout before touching any image.
preview_rows = train_df.head()
print(preview_rows)

# Function to load images (no resizing or array conversion happens here)
def load_images(df):
    """Open every image listed in ``df`` and return it with its label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'filename' column (the path handed to ``Image.open``)
        and a 'label' column.

    Returns
    -------
    tuple[list, list]
        ``(images, labels)``: the opened PIL images and their labels, in row
        order.

    Raises
    ------
    Any exception raised by ``Image.open`` or ``Image.load`` for a missing or
    unreadable file propagates to the caller.
    """
    images = []
    labels = []
    # Walk the two columns directly; only these two values are needed per row.
    for filename, label in zip(df['filename'], df['label']):
        img = Image.open(filename)
        # Image.open is lazy and keeps the underlying file open until the pixel
        # data is read. Decoding right away lets Pillow release the handle, so
        # a manifest with thousands of rows cannot exhaust the process's
        # open-file limit.
        img.load()
        images.append(img)
        labels.append(label)
    return images, labels

# Open every training image and collect its label alongside it.
train_images, train_labels = load_images(train_df)

# Sanity check: show the first image object together with its label.
first_image, first_label = train_images[0], train_labels[0]
print("0", first_image, first_label)

# Example of resizing images
def preprocess_images(images, size=(128, 128)):
    """Return resized versions of ``images``.

    Parameters
    ----------
    images : iterable
        Objects exposing a ``resize(size)`` method (PIL images in this notebook).
    size : tuple, optional
        Target size passed unchanged to each ``resize`` call. Defaults to
        ``(128, 128)``.

    Returns
    -------
    list
        The result of ``resize(size)`` for each input, in input order.
    """
    return [picture.resize(size) for picture in images]

# Resize every loaded image with the default target size, then re-inspect the first sample.
train_images_resized = preprocess_images(train_images)
first_resized = train_images_resized[0]
print(first_resized, train_labels[0])

            filename                     label
0  train/Image_1.jpg          SOUTHERN DOGFACE
1  train/Image_2.jpg                    ADONIS
2  train/Image_3.jpg            BROWN SIPROETA
3  train/Image_4.jpg                   MONARCH
4  train/Image_5.jpg  GREEN CELLED CATTLEHEART
0 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=224x224 at 0x24A0EBF9DD0> SOUTHERN DOGFACE
<PIL.Image.Image image mode=RGB size=128x128 at 0x24A155A2690> SOUTHERN DOGFACE


In [3]:
# Fit an encoder on the training labels and turn each label into an integer id.
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(train_labels)

# Build a name -> id lookup from the fitted classes and print it for inspection.
class_names = label_encoder.classes_
class_ids = label_encoder.transform(class_names)
label_mapping = {name: class_id for name, class_id in zip(class_names, class_ids)}
print(label_mapping)


{'ADONIS': 0, 'AFRICAN GIANT SWALLOWTAIL': 1, 'AMERICAN SNOOT': 2, 'AN 88': 3, 'APPOLLO': 4, 'ATALA': 5, 'BANDED ORANGE HELICONIAN': 6, 'BANDED PEACOCK': 7, 'BECKERS WHITE': 8, 'BLACK HAIRSTREAK': 9, 'BLUE MORPHO': 10, 'BLUE SPOTTED CROW': 11, 'BROWN SIPROETA': 12, 'CABBAGE WHITE': 13, 'CAIRNS BIRDWING': 14, 'CHECQUERED SKIPPER': 15, 'CHESTNUT': 16, 'CLEOPATRA': 17, 'CLODIUS PARNASSIAN': 18, 'CLOUDED SULPHUR': 19, 'COMMON BANDED AWL': 20, 'COMMON WOOD-NYMPH': 21, 'COPPER TAIL': 22, 'CRECENT': 23, 'CRIMSON PATCH': 24, 'DANAID EGGFLY': 25, 'EASTERN COMA': 26, 'EASTERN DAPPLE WHITE': 27, 'EASTERN PINE ELFIN': 28, 'ELBOWED PIERROT': 29, 'GOLD BANDED': 30, 'GREAT EGGFLY': 31, 'GREAT JAY': 32, 'GREEN CELLED CATTLEHEART': 33, 'GREY HAIRSTREAK': 34, 'INDRA SWALLOW': 35, 'IPHICLUS SISTER': 36, 'JULIA': 37, 'LARGE MARBLE': 38, 'MALACHITE': 39, 'MANGROVE SKIPPER': 40, 'MESTRA': 41, 'METALMARK': 42, 'MILBERTS TORTOISESHELL': 43, 'MONARCH': 44, 'MOURNING CLOAK': 45, 'ORANGE OAKLEAF': 46, 'ORANGE TI

Step 3: Model Selection and Training

In [4]:
# Function to load and preprocess images
def load_images(df):
    """Load the images listed in ``df`` as 128x128 RGB numpy arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'filename' column (the path handed to ``Image.open``)
        and a 'label' column.

    Returns
    -------
    tuple[list, list]
        ``(images, labels)``: one array per successfully loaded image and the
        label of the same row, in row order. Rows whose image cannot be loaded
        are reported with ``print`` and left out of both lists, so the lists
        always have equal length.
    """
    images = []
    labels = []
    for filename, label in zip(df['filename'], df['label']):
        try:
            # The context manager releases the file handle as soon as the
            # pixels have been copied out, instead of leaving one open per row.
            with Image.open(filename) as img:
                # Force three channels before resizing: grayscale, palette or
                # RGBA files would otherwise yield arrays with a different
                # channel count, which cannot be stacked with RGB ones or fed
                # to a network expecting (128, 128, 3) inputs.
                rgb = img.convert('RGB').resize((128, 128))
                pixels = np.array(rgb)
        except Exception as e:
            # Best effort: skip unreadable files but say which one failed.
            print(f"Error loading image: {filename}. Error: {e}")
            continue
        images.append(pixels)
        labels.append(label)
    return images, labels

# Load images and labels
train_images, train_labels = load_images(train_df)

# Every image needs exactly one label before the two are split together below.
assert len(train_images) == len(train_labels), "images and labels are misaligned"

# Stack the per-image arrays into a single array for further processing
train_images = np.array(train_images)

# Encode labels to numeric values
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(train_labels)

# Convert numeric labels to one-hot encoded labels
one_hot_labels = to_categorical(numeric_labels)

# Hold out 20% of the samples for validation; the fixed random_state keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(train_images, one_hot_labels, test_size=0.2, random_state=42)

# Normalize pixel values to [0, 1] range.
# Cast to float32 first: dividing the integer pixel array by 255.0 directly would
# produce float64 and double the memory held by the dataset for no benefit.
X_train = X_train.astype('float32') / 255.0
X_val = X_val.astype('float32') / 255.0


In [5]:
# Define the CNN model
# Read the input shape and the number of classes from the prepared data instead
# of hard-coding them, so the network stays in sync with the arrays it is trained
# on if the image size or the label set changes.
input_shape = X_train.shape[1:]   # per-sample shape: everything after the batch axis
num_classes = y_train.shape[1]    # width of the one-hot label matrix

model = Sequential([
    # Three convolution + max-pooling stages with 32, 64 and 128 filters.
    Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    # Classifier head: flatten, one hidden layer with dropout, softmax over the classes.
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile the model; categorical cross-entropy matches the one-hot encoded targets
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 126, 126, 32)      896       
                                                                 
 max_pooling2d (MaxPooling2  (None, 63, 63, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 61, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 30, 30, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 28, 28, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 14, 14, 128)     

In [6]:
# Fit for ten epochs in mini-batches of 32, validating on the held-out split.
validation_split = (X_val, y_val)
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=validation_split,
)

# Score the trained network once more on the validation split and report both metrics.
val_loss, val_accuracy = model.evaluate(*validation_split)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Persist the trained network to disk.
model.save('butterfly_classifier_model.h5')

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Loss: 1.7582179307937622
Validation Accuracy: 0.5592307448387146


  saving_api.save_model(


Step 4: Evaluation and Testing

In [6]:
# Path to the saved model file
model_path = 'butterfly_classifier_model.h5'

# Check if the file exists.
# Raise instead of calling exit(): exit() is the interactive-shell quit helper,
# not an error signal. An exception stops this cell (and a "Run All") right here
# and leaves the reason visible in the cell output.
if os.path.exists(model_path):
    print(f"Model file '{model_path}' exists.")
else:
    print(f"Model file '{model_path}' does not exist.")
    raise FileNotFoundError(model_path)

# Attempt to load the model.
# The path is passed straight to load_model, which opens the file itself; no
# separate h5py handle is needed.
try:
    model = tf.keras.models.load_model(model_path)
    print("Model loaded successfully.")
except Exception as e:
    # Report the failure in the same format as before, then re-raise so the
    # following code never runs against a missing or stale `model`.
    print(f"Error loading model: {e}")
    raise

# Function to load and preprocess images
def load_images(df):
    """Load the images listed in ``df`` as 128x128 RGB numpy arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'filename' column (the path handed to ``Image.open``).

    Returns
    -------
    tuple[list, list]
        ``(images, filenames)``: one array per successfully loaded image and
        the path it was read from, in row order. Rows whose image cannot be
        loaded are reported with ``print`` and left out of both lists, so the
        result may be shorter than ``df``.
    """
    images = []
    filenames = []
    for filename in df['filename']:
        try:
            # The context manager releases the file handle as soon as the
            # pixels have been copied out, instead of leaving one open per row.
            with Image.open(filename) as img:
                # Force three channels before resizing: grayscale, palette or
                # RGBA files would otherwise yield arrays with a different
                # channel count, which cannot be stacked with RGB ones or fed
                # to a network expecting (128, 128, 3) inputs.
                rgb = img.convert('RGB').resize((128, 128))
                pixels = np.array(rgb)
        except Exception as e:
            # Best effort: skip unreadable files but say which one failed.
            print(f"Error loading image: {filename}. Error: {e}")
            continue
        images.append(pixels)
        filenames.append(filename)
    return images, filenames

# Load the CSV file (assuming filenames are listed without labels)
test_df = pd.read_csv('Testing_set.csv')

# Load test images and the filenames that were actually read
test_images, test_filenames = load_images(test_df)

# Convert images to numpy array for further processing
test_images = np.array(test_images)

# Normalize pixel values to [0, 1] range.
# Cast to float32 first so the division does not produce a float64 array.
test_images = test_images.astype('float32') / 255.0

# Make predictions on the test data
predictions = model.predict(test_images)

# Convert predictions from numeric indices to butterfly species names.
# inverse_transform is the encoder's own index -> name lookup; it replaces
# rebuilding the key and value lists of a mapping dict for every prediction.
predicted_classes = np.argmax(predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_classes)

# Add predicted labels to the DataFrame.
# Match on filename rather than position: load_images drops rows whose image
# failed to load, so the predictions may be shorter than test_df. Rows without
# a prediction end up with a missing label instead of breaking the assignment.
label_by_filename = dict(zip(test_filenames, predicted_labels))
test_df['label'] = test_df['filename'].map(label_by_filename)

# Display the first few rows of the updated DataFrame
print(test_df.head())

# Save the updated CSV file with predicted labels
test_df.to_csv('Testing_set_with_predictions.csv', index=False)

Model file 'butterfly_classifier_model.h5' exists.
Model loaded successfully.
           filename                   label
0  test/Image_1.jpg              PINE WHITE
1  test/Image_2.jpg           CRIMSON PATCH
2  test/Image_3.jpg                  ADONIS
3  test/Image_4.jpg         IPHICLUS SISTER
4  test/Image_5.jpg  MILBERTS TORTOISESHELL
