### 1) Setup and Imports

In [24]:
# Basic setup
import os, random, shutil, json, math, itertools, glob
from pathlib import Path
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)
SEED = 42
tf.keras.utils.set_random_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Kaggle I/O paths
DATA_DIR = "/kaggle/input/hh25306/Dataset"  # TODO: set this
WORK_DIR = "/kaggle/working"
os.makedirs(WORK_DIR, exist_ok=True)


2.18.0


### 2) Scan Dataset and Build File Manifest

In [25]:
# Read files from class-named folders and build a dataframe: filepath, label
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

def list_images(root):
    root = Path(root)
    rows = []
    for cls_dir in sorted(p for p in root.iterdir() if p.is_dir()):
        label = cls_dir.name
        for img in cls_dir.rglob("*"):
            if img.suffix.lower() in IMG_EXTS:
                rows.append((str(img), label))
    return pd.DataFrame(rows, columns=["filepath", "label"])

df = list_images(DATA_DIR)
print("Total images:", len(df))
print("Classes:", df['label'].unique())
df.sample(5).head()


Total images: 2295
Classes: ['Endangered_species' 'algal_bloom' 'clear_water' 'deforestation'
 'forest_fire' 'good_air' 'no_deforestation' 'plastic' 'polluted_air']


Unnamed: 0,filepath,label
422,/kaggle/input/hh25306/Dataset/Endangered_speci...,Endangered_species
1289,/kaggle/input/hh25306/Dataset/forest_fire/fore...,forest_fire
208,/kaggle/input/hh25306/Dataset/Endangered_speci...,Endangered_species
1633,/kaggle/input/hh25306/Dataset/good_air/goodAir...,good_air
1628,/kaggle/input/hh25306/Dataset/good_air/goodAir...,good_air


In [27]:
# Create a physical holdout test set by copying a stratified sample to WORK_DIR/test_holdout
from sklearn.model_selection import train_test_split

HOLDOUT_SIZE = 0.15  # 15% to physical holdout
HOLDOUT_DIR = os.path.join(WORK_DIR, "test_holdout")
os.makedirs(HOLDOUT_DIR, exist_ok=True)

# Split indices for holdout (stratified)
df['label_id'] = df['label'].map(lambda x: x)  # temporary placeholder; will re-encode later
holdout_idx, keep_idx = train_test_split(
    np.arange(len(df)),
    test_size=1.0 - HOLDOUT_SIZE,
    stratify=df['label'],
    random_state=SEED
)
holdout_df = df.iloc[holdout_idx].copy().reset_index(drop=True)
keep_df = df.iloc[keep_idx].copy().reset_index(drop=True)

# Copy holdout files into class-named folders under HOLDOUT_DIR
for _, row in holdout_df.iterrows():
    src = row['filepath']
    cls = row['label']
    dst_dir = os.path.join(HOLDOUT_DIR, cls)
    os.makedirs(dst_dir, exist_ok=True)
    dst = os.path.join(dst_dir, os.path.basename(src))
    if not os.path.exists(dst):
        shutil.copy2(src, dst)

print(f"Physical holdout created at: {HOLDOUT_DIR}")
print("Holdout counts:\n", holdout_df['label'].value_counts().sort_index())
print("Remaining (for train/val) counts:\n", keep_df['label'].value_counts().sort_index())


Physical holdout created at: /kaggle/working/test_holdout
Holdout counts:
 label
Endangered_species    75
algal_bloom           39
clear_water           16
deforestation         32
forest_fire           41
good_air              48
no_deforestation      44
plastic               16
polluted_air          33
Name: count, dtype: int64
Remaining (for train/val) counts:
 label
Endangered_species    425
algal_bloom           218
clear_water            91
deforestation         180
forest_fire           232
good_air              275
no_deforestation      251
plastic                93
polluted_air          186
Name: count, dtype: int64


### 3) Encode Labels and Class Counts

In [28]:
# label encoding on remaining data (keep_df)
classes = sorted(keep_df['label'].unique())
class_to_id = {c:i for i,c in enumerate(classes)}
id_to_class = {i:c for c,i in class_to_id.items()}

keep_df['label_id'] = keep_df['label'].map(class_to_id)

# Class counts on remaining data
class_counts = keep_df['label'].value_counts().sort_index()
print("Class counts (train+val pool):\n", class_counts)

# Save mapping for inference
with open(os.path.join(WORK_DIR, "label_mapping.json"), "w") as f:
    json.dump({"class_to_id": class_to_id, "id_to_class": id_to_class}, f, indent=2)



Class counts (train+val pool):
 label
Endangered_species    425
algal_bloom           218
clear_water            91
deforestation         180
forest_fire           232
good_air              275
no_deforestation      251
plastic                93
polluted_air          186
Name: count, dtype: int64


### 4) Train/Val/Test Split (Stratified)

In [29]:
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.10   # internal programmatic test
VAL_SIZE = 0.15

trainval_df, test_df = train_test_split(
    keep_df, test_size=TEST_SIZE, stratify=keep_df['label_id'], random_state=SEED
)
val_rel = VAL_SIZE / (1.0 - TEST_SIZE)
train_df, val_df = train_test_split(
    trainval_df, test_size=val_rel, stratify=trainval_df['label_id'], random_state=SEED
)

for name, d in [("train", train_df), ("val", val_df), ("test", test_df)]:
    print(name, len(d))
    print(d['label'].value_counts().sort_index())


train 1462
label
Endangered_species    318
algal_bloom           163
clear_water            69
deforestation         135
forest_fire           174
good_air              206
no_deforestation      188
plastic                70
polluted_air          139
Name: count, dtype: int64
val 293
label
Endangered_species    64
algal_bloom           33
clear_water           13
deforestation         27
forest_fire           35
good_air              41
no_deforestation      38
plastic               14
polluted_air          28
Name: count, dtype: int64
test 196
label
Endangered_species    43
algal_bloom           22
clear_water            9
deforestation         18
forest_fire           23
good_air              28
no_deforestation      25
plastic                9
polluted_air          19
Name: count, dtype: int64


In [30]:
# On-disk augmentation: create augmented copies of training images
# Augs: horizontal flip, 90/180/270 rotations
from PIL import Image

AUG_TRAIN_DIR = os.path.join(WORK_DIR, "aug_train")
if os.path.exists(AUG_TRAIN_DIR):
    shutil.rmtree(AUG_TRAIN_DIR)
os.makedirs(AUG_TRAIN_DIR, exist_ok=True)

def augment_and_save(src_path, dst_dir, base_name):
    # Load
    img = Image.open(src_path).convert("RGB")
    aug_paths = []

    # 1) Horizontal flip
    img_h = img.transpose(Image.FLIP_LEFT_RIGHT)
    path_h = os.path.join(dst_dir, f"{base_name}_h.jpg")
    img_h.save(path_h, quality=95)
    aug_paths.append(path_h)

    # 2) Rotations 90, 180, 270
    for k, angle in enumerate([90, 180, 270]):
        img_r = img.rotate(angle, expand=True)
        path_r = os.path.join(dst_dir, f"{base_name}_r{angle}.jpg")
        img_r.save(path_r, quality=95)
        aug_paths.append(path_r)

    return aug_paths

aug_rows = []
for _, row in train_df.iterrows():
    src = row['filepath']
    cls = row['label']
    dst_dir = os.path.join(AUG_TRAIN_DIR, cls)
    os.makedirs(dst_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(src))[0]
    new_paths = augment_and_save(src, dst_dir, base_name)
    for p in new_paths:
        aug_rows.append((p, cls))

aug_df = pd.DataFrame(aug_rows, columns=["filepath", "label"])
aug_df['label_id'] = aug_df['label'].map(class_to_id)

# Merge original train + augmented train
train_df_aug = pd.concat([train_df[['filepath','label','label_id']], aug_df], ignore_index=True)
print("Original train size:", len(train_df))
print("Augmented copies added:", len(aug_df))
print("Total train after on-disk aug:", len(train_df_aug))
print(train_df_aug['label'].value_counts().sort_index())


Original train size: 1462
Augmented copies added: 5848
Total train after on-disk aug: 7310
label
Endangered_species    1590
algal_bloom            815
clear_water            345
deforestation          675
forest_fire            870
good_air              1030
no_deforestation       940
plastic                350
polluted_air           695
Name: count, dtype: int64


### 6) Data Pipeline (tf.data) with Augmentations

In [31]:
IMG_SIZE = 224
BATCH_SIZE = 32
AUTO = tf.data.AUTOTUNE

# Augmentation pipeline
data_augment = keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
    layers.RandomContrast(0.1),
], name="augment")

def decode_img(path, label):
    img = tf.io.read_file(path)
    img = tf.io.decode_image(img, channels=3, expand_animations=False)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE), method="bilinear")
    img = tf.cast(img, tf.float32) / 255.0
    return img, label

def build_dataset(df, training=False):
    paths = df['filepath'].tolist()
    labels = df['label_id'].tolist()
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    ds = ds.shuffle(buffer_size=len(paths), seed=SEED) if training else ds
    ds = ds.map(lambda p,l: decode_img(p, l), num_parallel_calls=AUTO)
    if training:
        ds = ds.map(lambda x,y: (data_augment(x, training=True), y), num_parallel_calls=AUTO)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTO)
    return ds

val_ds = build_dataset(val_df, training=False)
test_ds = build_dataset(test_df, training=False)
train_ds = build_dataset(train_df_aug, training=True)

### 5) Bagging Sampler (Bootstrap per Class)

In [32]:
from collections import defaultdict

N_BAGS = 3  # adjust for runtime
TARGET_PER_CLASS = int(train_df_aug['label'].value_counts().max())  # baseline size
print("Target samples per class per bag:", TARGET_PER_CLASS)

def make_bootstrap_bag(train_df, target_per_class, seed):
    rng = np.random.default_rng(seed)
    bag_indices = []
    for cls, grp in train_df.groupby('label'):
        idxs = grp.index.to_numpy()
        # sample with replacement to target size; if class already large, cap at its size or mildly oversample
        size = max(target_per_class, min(len(grp), target_per_class))
        chosen = rng.choice(idxs, size=size, replace=True)
        bag_indices.extend(chosen.tolist())
    rng.shuffle(bag_indices)
    return train_df.loc[bag_indices].reset_index(drop=True)

bags = [ make_bootstrap_bag(train_df_aug, TARGET_PER_CLASS, SEED+i) for i in range(N_BAGS) ]
for i, b in enumerate(bags):
    print(f"Bag {i}: {len(b)} samples")


Target samples per class per bag: 1590
Bag 0: 14310 samples
Bag 1: 14310 samples
Bag 2: 14310 samples


### 7) Custom CNN Model

In [33]:
def conv_block(x, filters, k=3, s=1, p="same"):
    x = layers.Conv2D(filters, k, strides=s, padding=p, use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    return x

def se_block(x, r=8):
    # Squeeze-and-Excitation to improve channel attention
    c = x.shape[-1]
    se = layers.GlobalAveragePooling2D()(x)
    se = layers.Dense(c//r, activation="relu")(se)
    se = layers.Dense(c, activation="sigmoid")(se)
    se = layers.Reshape((1,1,c))(se)
    return layers.Multiply()([x, se])

def build_custom_cnn(input_shape=(IMG_SIZE, IMG_SIZE, 3), n_classes=len(classes)):
    inp = layers.Input(shape=input_shape)
    x = conv_block(inp, 32)
    x = conv_block(x, 32)
    x = layers.MaxPool2D()(x)

    x = conv_block(x, 64)
    x = conv_block(x, 64)
    x = se_block(x)
    x = layers.MaxPool2D()(x)

    x = conv_block(x, 128)
    x = conv_block(x, 128)
    x = se_block(x)
    x = layers.MaxPool2D()(x)

    x = conv_block(x, 192)
    x = conv_block(x, 192)
    x = se_block(x)
    x = layers.MaxPool2D()(x)

    x = layers.Conv2D(256, 1, use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)

    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.4)(x)
    out = layers.Dense(n_classes, activation="softmax")(x)

    model = keras.Model(inp, out)
    return model

model = build_custom_cnn()
model.summary()


### 8) Optimizer, Loss, Metrics, Class Weights

In [34]:
# Compute class weights from train_df (pre-bag) to emphasize minority classes
from sklearn.utils.class_weight import compute_class_weight

y_train_ids = train_df['label_id'].values
class_weights_arr = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(len(classes)),
    y=y_train_ids
)
class_weights = {i: float(w) for i, w in enumerate(class_weights_arr)}
print("Class weights:", class_weights)

LR = 1e-3
EPOCHS = 30

def compile_model():
    m = build_custom_cnn()
    m.compile(
        optimizer=keras.optimizers.Adam(learning_rate=LR),
        loss="sparse_categorical_crossentropy",   # correct for integer labels
        metrics=[
            "accuracy",
            keras.metrics.SparseTopKCategoricalAccuracy(k=3, name="top3_acc")
        ],
    )
    return m


# Callbacks
ckpt_path = os.path.join(WORK_DIR, "best_model.keras")
callbacks = [
    keras.callbacks.ModelCheckpoint(
        ckpt_path, monitor="val_accuracy", save_best_only=True, mode="max", verbose=1
    ),
    keras.callbacks.EarlyStopping(
        monitor="val_accuracy", patience=7, restore_best_weights=True, mode="max", verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=3, min_lr=1e-5, verbose=1
    ),
]


Class weights: {0: 0.5108315863032844, 1: 0.9965916837082481, 2: 2.3542673107890497, 3: 1.2032921810699588, 4: 0.933588761174968, 5: 0.7885652642934197, 6: 0.8640661938534279, 7: 2.320634920634921, 8: 1.1686650679456434}


### 9) Train With Bagging (Ensemble)

In [36]:
bag_models = []
histories = []

for i, bag_df in enumerate(bags):
    print(f"\n=== Training bag {i+1}/{N_BAGS} ===")
    train_ds = build_dataset(bag_df, training=True)

    m = compile_model()
    h = m.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        class_weight=class_weights,
        callbacks=callbacks,
        verbose=2
    )
    histories.append(h.history)

    # Save this bag model separately too (optional)
    bag_path = os.path.join(WORK_DIR, f"bag_{i}_final.keras")
    m.save(bag_path)
    bag_models.append(m)

print("Best checkpoint saved to:", ckpt_path)



=== Training bag 1/3 ===
Epoch 1/30

Epoch 1: val_accuracy did not improve from 0.52901
448/448 - 152s - 340ms/step - accuracy: 0.5435 - loss: 1.3737 - top3_acc: 0.8252 - val_accuracy: 0.5051 - val_loss: 1.7463 - val_top3_acc: 0.7338 - learning_rate: 0.0010
Epoch 2/30

Epoch 2: val_accuracy did not improve from 0.52901
448/448 - 127s - 285ms/step - accuracy: 0.6309 - loss: 1.0217 - top3_acc: 0.8971 - val_accuracy: 0.4744 - val_loss: 1.9317 - val_top3_acc: 0.7201 - learning_rate: 0.0010
Epoch 3/30

Epoch 3: val_accuracy improved from 0.52901 to 0.55290, saving model to /kaggle/working/best_model.keras
448/448 - 130s - 290ms/step - accuracy: 0.6707 - loss: 0.8924 - top3_acc: 0.9157 - val_accuracy: 0.5529 - val_loss: 1.6455 - val_top3_acc: 0.8225 - learning_rate: 0.0010
Epoch 4/30

Epoch 4: val_accuracy improved from 0.55290 to 0.59386, saving model to /kaggle/working/best_model.keras
448/448 - 129s - 288ms/step - accuracy: 0.7133 - loss: 0.7644 - top3_acc: 0.9312 - val_accuracy: 0.5939 

## 10) Evaluate on Test Set (Single Best Checkpoint + Ensemble)

In [37]:
# Load best single model by val_accuracy
best_model = keras.models.load_model(ckpt_path)
single_eval = best_model.evaluate(test_ds, verbose=0)
print("Best single model test metrics:", dict(zip(best_model.metrics_names, single_eval)))

# Ensemble predictions (average softmax across bag models)
def predict_ds_softmax(models, ds):
    all_probs = []
    for m in models:
        probs = m.predict(ds, verbose=0)
        all_probs.append(probs)
    mean_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
    return mean_probs

# For fair comparison, reload models to ensure same architecture if needed
# Here bag_models are already trained in-session
ensemble_probs = predict_ds_softmax(bag_models, test_ds)

# Build y_true from test_ds order
y_true = test_df['label_id'].to_numpy()
y_pred = ensemble_probs.argmax(axis=1)
ensemble_acc = (y_true == y_pred).mean()
print(f"Ensemble test accuracy: {ensemble_acc:.4f}")

# Optional: classification report & confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred, target_names=classes, digits=4))
cm = confusion_matrix(y_true, y_pred)
cm


Best single model test metrics: {'loss': 1.3130064010620117, 'compile_metrics': 0.7602040767669678}
Ensemble test accuracy: 0.7653
                    precision    recall  f1-score   support

Endangered_species     0.8158    0.7209    0.7654        43
       algal_bloom     0.6250    0.4545    0.5263        22
       clear_water     0.7273    0.8889    0.8000         9
     deforestation     0.7692    0.5556    0.6452        18
       forest_fire     0.8750    0.9130    0.8936        23
          good_air     0.8000    1.0000    0.8889        28
  no_deforestation     0.5758    0.7600    0.6552        25
           plastic     0.8750    0.7778    0.8235         9
      polluted_air     0.8889    0.8421    0.8649        19

          accuracy                         0.7653       196
         macro avg     0.7724    0.7681    0.7626       196
      weighted avg     0.7699    0.7653    0.7601       196



array([[31,  2,  0,  2,  2,  0,  4,  0,  2],
       [ 5, 10,  2,  0,  0,  2,  3,  0,  0],
       [ 0,  0,  8,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  0, 10,  1,  0,  7,  0,  0],
       [ 0,  1,  0,  0, 21,  0,  0,  1,  0],
       [ 0,  0,  0,  0,  0, 28,  0,  0,  0],
       [ 1,  3,  0,  1,  0,  1, 19,  0,  0],
       [ 0,  0,  1,  0,  0,  1,  0,  7,  0],
       [ 1,  0,  0,  0,  0,  2,  0,  0, 16]])

## 11) Save the Best Model (.keras) and Label Map

In [38]:
# best_model already saved at ckpt_path during training
# ensure mapping is with it
print("Saved model:", ckpt_path)
print("Label mapping saved at:", os.path.join(WORK_DIR, "label_mapping.json"))


Saved model: /kaggle/working/best_model.keras
Label mapping saved at: /kaggle/working/label_mapping.json


## 12) Inference: Single Image or Folder of Images

In [39]:
def load_best_model_and_mapping(model_path=ckpt_path, mapping_path=os.path.join(WORK_DIR, "label_mapping.json")):
    model = keras.models.load_model(model_path)
    with open(mapping_path, "r") as f:
        mapping = json.load(f)
    id_to_class = {int(k): v for k, v in mapping["id_to_class"].items()}
    return model, id_to_class

def preprocess_image_for_inference(path, img_size=IMG_SIZE):
    img = tf.io.read_file(path)
    img = tf.io.decode_image(img, channels=3, expand_animations=False)
    img = tf.image.resize(img, (img_size, img_size))
    img = tf.cast(img, tf.float32) / 255.0
    return img

def predict_paths(paths, topk=3):
    model, id_to_class = load_best_model_and_mapping()
    batch = tf.stack([preprocess_image_for_inference(p) for p in paths], axis=0)
    probs = model.predict(batch, verbose=0)
    preds = probs.argmax(axis=1)
    out = []
    for i, p in enumerate(paths):
        pred_id = int(preds[i])
        pred_cls = id_to_class[pred_id]
        pred_prob = float(probs[i, pred_id])
        # top-k
        top_idx = probs[i].argsort()[-topk:][::-1]
        top_items = [(id_to_class[int(j)], float(probs[i, j])) for j in top_idx]
        out.append({
            "path": p,
            "pred_class": pred_cls,
            "pred_prob": pred_prob,
            "topk": top_items
        })
    return out




In [42]:
# Usage examples:
# 1) Single image
test_image_path = "/kaggle/input/testdataa1/plastic3.jpg"
print(predict_paths([test_image_path]))

# 2) All images in a folder
# folder = "/kaggle/input/testdataa1"
# paths = [str(p) for p in Path(folder).glob("*") if p.suffix.lower() in IMG_EXTS]
# results = predict_paths(paths)
# for r in results[:5]:
#     print(r)

[{'path': '/kaggle/input/testdataa1/plastic3.jpg', 'pred_class': 'plastic', 'pred_prob': 0.9999961853027344, 'topk': [('plastic', 0.9999961853027344), ('Endangered_species', 2.1361727249313844e-06), ('algal_bloom', 1.5767458307891502e-06)]}]
