## Plant Specimen Image Classification
Description
Help advance biodiversity research by building an ML model that categorizes plant specimen images for the New York Botanical Garden. The goal is to use image-classification techniques to train a model capable of distinguishing among the image classes represented in this dataset (i.e., sorting images into classes) with a high level of accuracy.

Problem
Solution

In [113]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [114]:
# Read the metadata table for each split (train, validation, test) from the
# CSV files in the current working directory.
train_df, validate_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "BTTAIxNYBG-train.csv",
        "BTTAIxNYBG-validation.csv",
        "BTTAIxNYBG-test.csv",
    )
)

In [115]:
# Folders that hold the image files of each split. The paths are relative to
# the working directory; file names from the `imageFile` column are resolved
# against them.
train_image_directory, validate_image_directory, test_image_directory = (
    'BTTAIxNYBG-train/BTTAIxNYBG-train/',
    'BTTAIxNYBG-validation/BTTAIxNYBG-validation/',
    'BTTAIxNYBG-test/BTTAIxNYBG-test/',
)

In [116]:
# Loader for a single training image.
def train_load_and_preprocess_image(filename, target_size=(224, 224)):
    """Read one training image from disk and return it as a scaled batch of one.

    Args:
        filename: File name of the image, joined onto `train_image_directory`.
        target_size: Size passed to `image.load_img` when loading the file.

    Returns:
        The image as an array with a leading axis of length 1, with every
        value divided by 255.0.
    """
    pixels = image.img_to_array(
        image.load_img(
            os.path.join(train_image_directory, filename),
            target_size=target_size,
        )
    )
    # Prepend a batch axis (the model expects a batch of images), then divide
    # by 255.0 to normalize the values to [0, 1].
    return pixels[np.newaxis, ...] / 255.0

In [117]:
# Loader for a single validation image.
def validate_load_and_preprocess_image(filename, target_size=(224, 224)):
    """Read one validation image from disk and return it as a scaled batch of one.

    Args:
        filename: File name of the image, joined onto `validate_image_directory`.
        target_size: Size passed to `image.load_img` when loading the file.

    Returns:
        The image as an array with a leading axis of length 1, with every
        value divided by 255.0.
    """
    source_path = os.path.join(validate_image_directory, filename)
    loaded = image.load_img(source_path, target_size=target_size)
    # The model expects a batch of images, so add a batch axis in front.
    batch_of_one = np.expand_dims(image.img_to_array(loaded), axis=0)
    # Normalize to [0, 1].
    scaled = batch_of_one / 255.0
    return scaled


In [118]:
# Keep only the first 5000 rows of the training and validation tables.
# `.copy()` makes each subset an independent DataFrame rather than a slice of
# the full table, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
train_df = train_df[:5000].copy()
validate_df = validate_df[:5000].copy()

In [119]:
# Apply preprocessing to all images: call train_load_and_preprocess_image on
# every `imageFile` value and store each returned array in a new `imageData`
# column.
# NOTE(review): no later cell visible in this notebook reads `imageData` (the
# generators load images from disk by file name), so this keeps every decoded
# training image in memory at once — confirm the column is still needed.
train_df['imageData'] = train_df['imageFile'].apply(train_load_and_preprocess_image)

In [120]:
# Call validate_load_and_preprocess_image on every `imageFile` value and store
# each returned array in a new `imageData` column.
# NOTE(review): no later cell visible in this notebook reads `imageData` (the
# generators load images from disk by file name), so this keeps every decoded
# validation image in memory at once — confirm the column is still needed.
validate_df['imageData'] = validate_df['imageFile'].apply(validate_load_and_preprocess_image)

In [121]:
# Preview the first five validation rows as a sanity check.
validate_df.head(n=5)

Unnamed: 0,uniqueID,classLabel,classID,source,imageFile,imageData
0,7,microscope-slides,6,K,f09a8abe9e9e9ef2.jpg,"[[[[0.02745098 0.03137255 0.04705882], [0.0274..."
1,20,illustrations-color,2,BHL,a6c6868387c6af8c.jpg,"[[[[0.78039217 0.67058825 0.5254902 ], [0.7803..."
2,37,ordinary-pressed-specimens,9,YU,6062c68e8c34b292.jpg,"[[[[0.2509804 0.23529412 0.23921569], [0.2666..."
3,59,animal-specimens,0,CAS,363617271597dfd9.jpg,"[[[[0.52156866 0.5019608 0.49019608], [0.5215..."
4,60,biocultural-specimens,1,Met,4218d8d2f42b05e4.jpg,"[[[[0.32156864 0.3137255 0.31764707], [0.3215..."


In [122]:
# # Split dataset into training and validation sets
# ### Note: This is a common step in ML training, but in this challenge, since the validation set is provided separately, there is no need to call this function to distinguish between validation and train set.
# # train_df, validate_df = train_test_split(df, test_size=0.2, random_state=42)

# # Data augmentation configuration for training
# train_datagen = ImageDataGenerator(
#     rotation_range=40,
#     width_shift_range=0.2,
#     height_shift_range=0.2,
#     shear_range=0.2,
#     zoom_range=0.2,
#     horizontal_flip=True,
#     fill_mode='nearest'
# )

# # Note: No augmentation for validation data, only rescaling
# validation_datagen = ImageDataGenerator(rescale=1./255)


In [123]:
# Generator configuration that only rescales pixel values by 1/255
# (no augmentation).
datagen = ImageDataGenerator(rescale=1.0 / 255)

In [124]:
# # Convert dataframe to a format suitable for the model training
# def train_df_to_dataset(dataframe, datagen, batch_size=32):
#     datagen.flow_from_dataframe(
#         dataframe=dataframe,
#         directory=train_image_directory,
#         x_col='imageFile',
#         y_col='classLabel',
#         target_size=(256, 256),
#         batch_size=batch_size,
#         class_mode='categorical'  # Change this if not a multiclass classification
#     )

# def validate_df_to_dataset(dataframe, datagen, batch_size=32):
#     datagen.flow_from_dataframe(
#         dataframe=dataframe,
#         directory=validate_image_directory,
#         x_col='imageFile',
#         y_col='classLabel',
#         target_size=(256, 256),
#         batch_size=batch_size,
#         class_mode='categorical'  # Change this if not a multiclass classification
#     )

In [125]:
# # Create datasets for training and validation
# train_dataset = train_df_to_dataset(split_train_df, datagen)
# validation_dataset = validate_df_to_dataset(validate_df, datagen)

# # This setup is now ready for training with model.fit using the train_dataset and validation_dataset

In [126]:
# Augmenting configuration for training images: random rotations, shifts,
# shear, zoom and horizontal flips, with newly exposed pixels filled from the
# nearest edge.
# NOTE(review): the generators in this notebook are created from `datagen`,
# so this augmentation is not applied unless they are switched to
# `train_datagen`.
train_datagen = ImageDataGenerator(
    # Same 1/255 scaling as validate_datagen, so training and validation
    # pixels are on the same scale if this generator is used.
    rescale=1./255.,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)
# Validation images are only rescaled, never augmented.
validate_datagen = ImageDataGenerator(rescale=1./255.)

In [127]:
# train_datagen=ImageDataGenerator(rescale=1./255.)
# validate_datagen=ImageDataGenerator(rescale=1./255.)

In [128]:
# Batch iterators that read images from disk by file name and one-hot encode
# the `classLabel` column (class_mode="categorical").
# NOTE(review): both iterators use `datagen` (rescale only), so the
# augmentation configured in `train_datagen` is not applied — confirm this is
# intended.

# Training batches: reshuffled every epoch, with a fixed seed for
# reproducibility.
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_image_directory,
    x_col='imageFile',
    y_col='classLabel',
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode="categorical",
    target_size=(224, 224)
)

# Validation batches: kept in DataFrame order. Shuffling brings no benefit
# for evaluation, and a fixed order keeps any per-sample predictions aligned
# with the rows of validate_df.
valid_generator = datagen.flow_from_dataframe(
    dataframe=validate_df,
    directory=validate_image_directory,
    x_col='imageFile',
    y_col='classLabel',
    batch_size=32,
    seed=42,
    shuffle=False,
    class_mode="categorical",
    target_size=(224, 224)
)


Found 5000 validated image filenames belonging to 10 classes.
Found 5000 validated image filenames belonging to 10 classes.


In [129]:
# Test images get the same 1/255 rescaling and no augmentation.
test_datagen = ImageDataGenerator(rescale=1.0 / 255.0)

In [130]:
# Batch iterator over the unlabeled test images.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=test_image_directory,
    x_col='imageFile',
    target_size=(224, 224),
    batch_size=32,
    class_mode=None,   # no labels: the iterator yields image batches only
    shuffle=False,     # keep the order of test_df so predictions line up with its rows
    seed=42,
)

Found 30690 validated image filenames.


## CNN

In [131]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers

In [132]:
# Small CNN for 224x224 RGB inputs: two convolutional stages, each ending in
# 2x2 max-pooling and dropout, followed by a 256-unit dense layer and a
# 10-way softmax output.
model = Sequential([
    # Stage 1: two 3x3 convolutions with 32 filters each.
    Conv2D(32, (3, 3), padding='same', input_shape=(224, 224, 3)),
    Activation('relu'),
    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 2: one 3x3 convolution with 64 filters.
    Conv2D(64, (3, 3), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Classifier head.
    Flatten(),
    Dense(256),
    Activation('relu'),
    Dropout(0.5),
    Dense(10, activation='softmax'),
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 224, 224, 32)      896       
                                                                 
 activation_13 (Activation)  (None, 224, 224, 32)      0         
                                                                 
 conv2d_11 (Conv2D)          (None, 222, 222, 32)      9248      
                                                                 
 activation_14 (Activation)  (None, 222, 222, 32)      0         
                                                                 
 max_pooling2d_6 (MaxPoolin  (None, 111, 111, 32)      0         
 g2D)                                                            
                                                                 
 dropout_9 (Dropout)         (None, 111, 111, 32)      0         
                                                      

In [133]:
# model = tf.keras.models.Sequential([
#     # tf.keras.layers.RandomRotation(.25, input_shape=[256,256,1]),
#     tf.keras.layers.Conv2D(64, 7, padding="same", input_shape=[224,224,1]),
#     tf.keras.layers.MaxPooling2D(2),
#     tf.keras.layers.Conv2D(128, 3, padding="same", activation='relu'),
#     tf.keras.layers.Conv2D(128, 3, padding="same", activation='relu'),
#     tf.keras.layers.MaxPooling2D(2),
#     # tf.keras.layers.Conv2D(256, 3, padding="same", activation='relu'),
#     # tf.keras.layers.Conv2D(256, 3, padding="same", activation='relu'),
#     # tf.keras.layers.MaxPooling2D(2),
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(128, activation='relu'),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(10, activation='softmax')

# ])
# model.summary()

In [134]:
# model = Sequential()
# model.add(Conv2D(256, kernel_size=(3,3), activation='relu',input_shape=(224,224,3)))
# model.add(MaxPooling2D(2,2))
# model.add(Conv2D(64, kernel_size=(5,5), activation='relu'))
# model.add(MaxPooling2D(2,2))
# model.add(Flatten())
# model.add(Dense(16, activation='relu'))
# model.add(Dense(3, activation='softmax'))

In [135]:
# STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
# STEP_SIZE_VALID=valid_generator.n//valid_generator.batch_size
# model.fit_generator(generator=train_generator,
#                     steps_per_epoch=STEP_SIZE_TRAIN,
#                     validation_data=valid_generator,
#                     validation_steps=STEP_SIZE_VALID,
#                     epochs=10
# )

In [136]:
# model.compile(optimizer='adam',
#               loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#               metrics=['accuracy'])

In [137]:
# epochs=5
# history = model.fit(
#   train_generator,
#   validation_data=valid_generator,
#   epochs=epochs
# )

In [138]:
# Configure training: Adam optimizer and categorical cross-entropy loss
# (matching the one-hot labels from the generators), tracking accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [139]:
# model.fit(train_dataset, epochs=10, validation_data=(validation_dataset), batch_size=32)

In [140]:
# Train for 10 epochs, validating on valid_generator after each epoch.
# `batch_size` is deliberately not passed: the generators already produce
# batches (batch_size=32 was set when they were created), and Keras documents
# that it must not be specified for generator input.
# The returned History object is kept so per-epoch loss and accuracy can be
# inspected afterwards.
history = model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2c6ea7460>

In [141]:
# history = model.fit(train_generator, epochs=10, steps_per_epoch=subset_size//batch_size, validation_data=valid_generator,
#                     validation_steps=subset_size//batch_size) #, callbacks=[checkpoint])

In [142]:
# history

In [143]:
# prediction = model.predict(valid_generator)

In [144]:
# Score the trained model on the validation generator. evaluate() returns
# the loss followed by the compiled metrics (here: accuracy).
loss, accuracy = model.evaluate(x=valid_generator)
# Display the validation accuracy as the cell output.
accuracy



0.8483999967575073

In [145]:
# Run the model over the test images; each row of the result holds the model's
# softmax output for one image.
prediction = model.predict(x=test_generator)



In [146]:
# mean_squared_error(prediction, squared=False)

In [147]:
# loss, accuracy = model.evaluate(test_generator)
# accuracy  



0.0

In [148]:
# Build the submission table: one row per test image with its uniqueID and
# the index of the highest-scoring model output.
# NOTE(review): the argmax index is a position in the model's output, which
# presumably follows the training generator's class ordering — confirm that
# it matches the dataset's `classID` values.
results = pd.DataFrame(
    dict(
        uniqueID=test_df['uniqueID'],
        classID=prediction.argmax(axis=1),
    )
)

In [149]:
# Write the submission file without the DataFrame index column.
results.to_csv(path_or_buf='submission.csv', index=False)

In [151]:
#df[['uniqueID', 'classID']].to_csv("submission.csv")
#y_pred.to_csv("submission.csv")