### Libraries

In [None]:
# Import Data Science Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import itertools
import random

# Import visualization libraries
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import cv2
import seaborn as sns

# Tensorflow Libraries
from tensorflow import keras
from tensorflow.keras import layers,models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import Callback, EarlyStopping,ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam

# System libraries
from pathlib import Path
import os.path

# Metrics
from sklearn.metrics import classification_report, confusion_matrix

sns.set_style('darkgrid')

In [7]:
def seed_everything(seed=42):
    """Seed Python, NumPy and TensorFlow random number generators.

    Sets the ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` environment
    variables, seeds ``random``, ``numpy.random`` and ``tf.random``, and then
    makes a best-effort attempt at two optional TensorFlow APIs.

    Args:
        seed: Integer seed applied to every generator (default 42).

    Returns:
        None. The function only has side effects (environment variables and
        global RNG state).
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # setting it here presumably only affects processes launched later, not
    # the hash randomisation of the running kernel - confirm if that matters.
    os.environ["PYTHONHASHSEED"] = str(seed)
    os.environ["TF_DETERMINISTIC_OPS"] = "1"

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # TF >= 2.9 (best effort). Each optional call is attempted on its own so
    # that one missing/failing API does not silently skip the other, and any
    # skipped step is reported instead of being swallowed.
    optional_steps = (
        (
            "tf.keras.utils.set_random_seed",
            lambda: tf.keras.utils.set_random_seed(seed),
        ),
        (
            "tf.config.experimental.enable_op_determinism",
            lambda: tf.config.experimental.enable_op_determinism(),
        ),
    )
    for step_name, run_step in optional_steps:
        try:
            run_step()
        except Exception as exc:  # deliberate best effort: report, don't crash
            print(f"seed_everything: skipped {step_name} ({exc!r})")

seed_everything(42)

### Helper functions

In [10]:
import urllib.request
from pathlib import Path

url = "https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py"
helper_path = Path("helper_functions.py")

# NOTE(review): this imports code fetched from the moving `main` branch of a
# third-party repository. Consider vendoring the file or pinning the URL to a
# specific commit so the notebook cannot change behaviour between runs.
if helper_path.exists():
    # Reuse the local copy so Restart & Run All works offline and does not
    # silently pick up upstream changes.
    helper_status = "found locally"
else:
    # Download to a temporary name first so an interrupted transfer never
    # leaves a truncated helper_functions.py behind for the next run.
    partial_path = helper_path.with_name(helper_path.name + ".part")
    urllib.request.urlretrieve(url, str(partial_path))
    partial_path.replace(helper_path)
    helper_status = "downloaded"

# now import
from helper_functions import (
    create_tensorboard_callback, plot_loss_curves, unzip_data,
    compare_historys, walk_through_dir, pred_and_plot
)
print(f"helper_functions.py {helper_status} and imported")


helper_functions.py downloaded and imported


### Load and Transform Data

In [11]:
# Number of samples per batch (the code that consumes it is outside this section).
BATCH_SIZE = 32
# Presumably the (height, width) images are resized to - TODO confirm against the data-loading code.
TARGET_SIZE = (224, 224)

In [17]:
import kagglehub
from pathlib import Path
import pandas as pd

# Fetch the dataset through kagglehub and locate the annotation CSV inside it.
dataset_path = Path(kagglehub.dataset_download("jxwleong/coral-reef-dataset"))
csv_path = dataset_path / "combined_annotations_remapped.csv"

# Walk through common encodings and stop at the first one that decodes the file.
candidate_encodings = ("utf-8", "utf-8-sig", "cp1252", "latin1")
for enc in candidate_encodings:
    try:
        df = pd.read_csv(csv_path, encoding=enc)
    except UnicodeDecodeError:
        # This encoding cannot decode the file; move on to the next one.
        continue
    print("✅ Loaded with encoding:", enc)
    break

df.head()


Downloading to C:\Users\user\.cache\kagglehub\datasets\jxwleong\coral-reef-dataset\2.archive...


100%|██████████| 18.3G/18.3G [15:40<00:00, 20.9MB/s]  

Extracting files...





✅ Loaded with encoding: utf-8


Unnamed: 0,Name,Row,Column,Label,Unnamed: 4
0,i0201a.png,111,94,broken_coral_rubble,
1,i0201a.png,173,243,broken_coral_rubble,
2,i0201a.png,84,366,broken_coral_rubble,
3,i0201a.png,54,802,broken_coral_rubble,
4,i0201a.png,313,66,sand,


In [18]:
# Quick look at the frame: column index, first rows, number of distinct labels.
print(df.columns, df.head(), df["Label"].nunique(), sep="\n")


Index(['Name', 'Row', 'Column', 'Label', 'Unnamed: 4'], dtype='object')
         Name  Row  Column                Label  Unnamed: 4
0  i0201a.png  111      94  broken_coral_rubble         NaN
1  i0201a.png  173     243  broken_coral_rubble         NaN
2  i0201a.png   84     366  broken_coral_rubble         NaN
3  i0201a.png   54     802  broken_coral_rubble         NaN
4  i0201a.png  313      66                 sand         NaN
40


Clean the CSV (drop junk + normalize labels)

In [19]:
# df is already loaded by the CSV-loading cell above.
df = (
    # Drop the empty trailing column if it is present.
    df.drop(columns=["Unnamed: 4"], errors="ignore")
      # Remove rows without a label *before* casting to str; otherwise
      # astype(str) turns a missing value into the literal label "nan",
      # which would then be counted as a class of its own.
      .dropna(subset=["Label"])
      # normalize label text (keep underscores)
      .assign(Label=lambda frame: frame["Label"].astype(str).str.strip().str.lower())
)

print("Unique labels:", df["Label"].nunique())
print(df["Label"].value_counts().head(15))

Unique labels: 40
Label
crustose_coralline_algae    226017
turf                         43769
sand                         38880
porites                      35236
macroalgae                   23832
off                          13605
pocillopora                  11319
montipora                     8755
pavona                        5806
acropora                      3458
hard_substrate                2086
millepora                     1459
broken_coral_rubble           1025
montastraea                    645
leptastrea                     528
Name: count, dtype: int64


Remove “bad/invalid” labels

In [20]:
# Labels treated as invalid; every annotation carrying one of them is dropped.
bad_labels = {"bad", "dark", "off", "unknown", "unlabeled", "background", "water"}
has_valid_label = ~df["Label"].isin(bad_labels)
df = df[has_valid_label]

remaining_label_count = df["Label"].nunique()
print("After removing invalid labels:")
print("Unique labels:", remaining_label_count)


After removing invalid labels:
Unique labels: 37


Choose Top-N classes

In [21]:
TOP_N = 5  # change to 3, 4, 5, or 6

# value_counts() orders labels from most to least frequent, so the first
# TOP_N index entries are the most common labels.
label_frequencies = df["Label"].value_counts()
top_labels = label_frequencies.index[:TOP_N].tolist()
print("Top labels:", top_labels)

# Keep only annotations whose label is one of the selected classes.
in_top_labels = df["Label"].isin(top_labels)
df_top = df.loc[in_top_labels].copy()


Top labels: ['crustose_coralline_algae', 'turf', 'sand', 'porites', 'macroalgae']


Convert point-annotations → image-level label
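This creates one label per image by taking the most frequent label among the image's point annotations, which is what a whole-image classifier such as VGG19 needs.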

In [22]:
def _most_frequent_label(labels):
    """Return the label that occurs most often in `labels`."""
    return labels.value_counts().idxmax()


# One row per image: the image file name and its most frequent point label.
labels_by_image = df_top.groupby("Name")["Label"].agg(_most_frequent_label)
image_labels = (
    labels_by_image.rename_axis("filename")
                   .rename("label")
                   .reset_index()
)

print("Image-level samples:")
print(image_labels.head())
print("Image-level class counts:")
print(image_labels["label"].value_counts())


Image-level samples:
     filename label
0  i0201a.png  sand
1  i0201d.png  sand
2  i0202b.png  sand
3  i0202d.png  sand
4  i0203a.png  sand
Image-level class counts:
label
crustose_coralline_algae    1515
sand                         330
porites                      220
turf                         137
macroalgae                    23
Name: count, dtype: int64


Remove “uncertain” images

If an image's point annotations are spread across several labels, the majority vote is a weak signal.
Keep only images where the majority label accounts for at least 60% of the image's retained (top-N) point annotations.

In [23]:
def majority_ratio(s):
    """Return the share of entries in `s` held by its most frequent value.

    `value_counts(normalize=True)` yields relative frequencies sorted from
    highest to lowest, so the first entry is the majority value's share.
    """
    label_shares = s.value_counts(normalize=True)
    top_share = label_shares.iloc[0]
    return top_share

# Per image: share of its point annotations that carry the majority label.
# NOTE(review): the ratio is computed on df_top, i.e. only over annotations
# whose label is in the top-N set, so points with other labels do not lower
# it. Confirm this is the intended definition of an "uncertain" image.
ratios = (
    df_top.groupby("Name")["Label"]
          .apply(majority_ratio)
          .reset_index(name="majority_ratio")
)

# Drop any ratio column left over from a previous run of this cell first.
# Without this, re-running the cell merges a second copy (majority_ratio_x /
# majority_ratio_y) and the filter below fails with a KeyError.
image_labels = (
    image_labels.drop(columns=["majority_ratio"], errors="ignore")
                .merge(ratios, left_on="filename", right_on="Name", how="left")
                .drop(columns=["Name"])
)

# Minimum majority share an image needs to be kept.
THRESH = 0.6
is_confident = image_labels["majority_ratio"] >= THRESH
image_labels_clean = image_labels[is_confident].drop(columns=["majority_ratio"])

print("Before filter:", len(image_labels), "After filter:", len(image_labels_clean))
print(image_labels_clean["label"].value_counts())


Before filter: 2225 After filter: 1672
label
crustose_coralline_algae    1197
sand                         257
porites                      156
turf                          58
macroalgae                     4
Name: count, dtype: int64


In [24]:
# Show the local directory returned by kagglehub.dataset_download for this dataset.
print(dataset_path)


C:\Users\user\.cache\kagglehub\datasets\jxwleong\coral-reef-dataset\versions\2
