In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import torch
import open_clip
import numpy as np
import pandas as pd
from PIL import Image
from torchvision import transforms
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
file_path = "/fs/ess/PAS2136/Hawaii-2025/beetles_intake/BeetlePUUM/1. Completed_Data/CanonBeetles.csv"
canon_df = pd.read_csv(file_path)

In [4]:
canon_df["ImageFilePath"] = canon_df["cropped_image_path"].apply(lambda x: f"/fs/ess/PAS2136/Hawaii-2025/beetles_intake/BeetlePUUM/CANON/{x}")

In [5]:
cols = ['ImageFilePath', 'ScientificName']
df = canon_df[cols]
df.head(2)

Unnamed: 0,ImageFilePath,ScientificName
0,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax konanus
1,/fs/ess/PAS2136/Hawaii-2025/beetles_intake/Bee...,Mecyclothorax konanus


In [7]:
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")

tokenizer = open_clip.get_tokenizer("ViT-B-32")

device = "cuda" if torch.cuda.is_available() else "cpu"

model.to(device)

for param in model.parameters() :
    param.requires_grad = False


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [8]:
def extract_clip_features(image_path):
    
    image = Image.open(image_path).convert("RGB")
    
    image_tensor = preprocess_val(image).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features /= image_features.norm(dim=-1, keepdim=True)

    return image_features.cpu().numpy()
    

In [9]:
X = np.vstack([extract_clip_features(img) for img in df["ImageFilePath"]])

print(f"Extracted feature shape: {X.shape}")

le = LabelEncoder()

y = le.fit_transform(df["ScientificName"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Extracted feature shape: (1806, 512)


In [10]:
# Train MLP
MLP = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64), 
    max_iter=200,
    solver='adam', # lbfgs, sgd, adam
    activation='relu', # tanh, sgd, relu
    early_stopping = True,
    random_state=1645
)
MLP.fit(X_train_scaled, y_train)

# Evaluate
y_pred = MLP.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"CLIP + MLP Accuracy: {accuracy:.2%}")

CLIP + MLP Accuracy: 87.29%


In [11]:
unique_species = df["species"].unique()

species_ranges = {}
for species in unique_species:
    base_length = np.random.uniform(0.6, 1.0)  # Base length between 0.6 and 1.0
    base_width = np.random.uniform(0.3, 0.6)   # Base width between 0.3 and 0.6
    
    species_ranges[species] = {
        "length_range": (base_length - 0.1, base_length + 0.1),  # Small variation
        "width_range": (base_width - 0.1, base_width + 0.1)
    }


def assign_random_elytra(species):
    length_range = species_ranges[species]["length_range"]
    width_range = species_ranges[species]["width_range"]
    
    elytra_length = np.random.uniform(*length_range)
    elytra_width = np.random.uniform(*width_range)
    
    return elytra_length, elytra_width
    
df[["elytra_length", "elytra_width"]] = df["species"].apply(lambda sp: assign_random_elytra(sp)).apply(pd.Series)
df.head(2)


Unnamed: 0,annotation_uuid,ImageFileName,BeetleID,species,image_path,elytra_length,elytra_width
0,98bc02b2-baef-430e-b677-fb7f496455cf,IMG_0093.JPG,BET.D20.000001,Mecyclothorax konanus,/fs/ess/PAS2136/Rayeed/BeetlePUUM/CANON/indivi...,0.638269,0.580266
1,2a205423-aa6f-4fcd-b366-c0311e5790e2,IMG_0093.JPG,BET.D20.000003,Mecyclothorax konanus,/fs/ess/PAS2136/Rayeed/BeetlePUUM/CANON/indivi...,0.588267,0.52504


In [12]:
# Extract features
X_image = np.vstack([extract_clip_features(img) for img in df["image_path"]])
X_structured = df[["elytra_length", "elytra_width"]].to_numpy()
X_combined = np.hstack((X_image, X_structured))

# Encode labels
le = LabelEncoder()
y = le.fit_transform(df["species"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, stratify=y, random_state=42)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [13]:
# Train MLP
MLP = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64), 
    max_iter=200,
    solver='adam', # lbfgs, sgd, adam
    activation='relu', # tanh, sgd, relu
    early_stopping = True,
    random_state=1645
)
MLP.fit(X_train_scaled, y_train)

# Evaluate
y_pred = MLP.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"CLIP + MLP Accuracy: {accuracy:.2%}")


CLIP + MLP Accuracy: 89.78%
