In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import sys
import time
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Suppress TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras import layers, regularizers
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import Sequence

# --- 0. GOOGLE COLAB SETUP & PATHS (CRITICAL) ---

# All inputs are expected under Colab's default working directory.
DATA_PATH = '/content'
CSV_PATH = os.path.join(DATA_PATH, 'train.csv')
IMG_DIR = os.path.join(DATA_PATH, 'train/')

# Upload instructions, emitted one line per print call before any check runs.
for instruction_line in (
    "--- Colab Setup: Data Upload Instructions ---",
    "\n!!! CRITICAL ACTION REQUIRED: UPLOAD DATA !!!",
    "To run this successfully, you must manually upload the following files:",
    "1. **Upload `train.csv`**: Upload the file directly to the `/content` folder using the Colab file panel (left sidebar).",
    "2. **Upload Images**: Create a subfolder named `train` inside `/content` and upload a very small subset (around 100MB) of images into it. You MUST preserve the original nested subfolder structure (e.g., /content/train/0/0/0/...) for the image paths to work.",
    "--------------------------------------------------------------------------",
):
    print(instruction_line)

# 1. The CSV is the one hard requirement: report and stop when it is missing.
if os.path.exists(CSV_PATH):
    print(f"\nFile {os.path.basename(CSV_PATH)} found. Proceeding with data loading and filtering.")
else:
    print(f"\n--- CRITICAL ERROR: FILE NOT FOUND ---")
    print(f"File: {CSV_PATH}")
    print(f"Action: Please ensure 'train.csv' has been successfully uploaded to the '/content' directory.")
    sys.exit(1)

# 2. The image folder is created on demand so later path handling has a
#    directory to point at even if no image was uploaded yet.
if not os.path.exists(IMG_DIR):
    os.makedirs(IMG_DIR)

print("\n--- Setup Check Complete ---")

# --- 1. GLOBAL CONFIGURATION ---
# One seed is shared by TensorFlow, NumPy, the train/val/test splits, the
# augmentation layers and the Dropout layer further down in this file.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
IMG_SIZE = 128  # Images are resized to IMG_SIZE x IMG_SIZE; also the model's input size
BATCH_SIZE = 32  # Samples per batch requested from each data generator
EPOCHS = 15  # Upper bound passed to model.fit; EarlyStopping may end training sooner
WEIGHT_DECAY = 0.0001  # L2 factor applied to the kernel of the 512-unit Dense layer
DROPOUT_RATE = 0.5  # Rate of the Dropout layer in the classification head
LEARNING_RATE = 1e-4  # Initial Adam learning rate; ReduceLROnPlateau may lower it

# --- MINIMUM DATASET SETTINGS (FOR MINIMAL DATA/MEMORY USE) ---
# These two limits are read by load_traindf to shrink the dataframe before
# anything is split or trained, keeping memory use and training time small.
MIN_IMAGES_PER_CLASS = 10  # Only include landmarks with at least 10 images
MAX_CLASSES = 5            # Limit to the top 5 most frequent, suitable classes
# ----------------------------------------------------------------

# --- 2. CUSTOM DATA GENERATOR (Memory Efficient) ---

class LandmarkDataGenerator(Sequence):
    """
    Keras Sequence that builds batches of landmark images straight from disk.

    Only the dataframe of paths and labels is held in memory; pixels are read,
    resized and scaled to [0, 1] one batch at a time.

    Args:
        df: DataFrame with an ``img_path`` column (image file path) and a
            ``class_idx`` column (integer label).
        img_size: Side length in pixels; every image is resized to a square
            of this size.
        batch_size: Maximum number of samples per batch. Must be positive.
        class_list: Landmark ids, stored on the instance for callers; the
            generator itself does not read it.
        augmentation: Optional callable applied to the whole image batch
            (e.g. a Keras ``Sequential`` of random preprocessing layers).
        shuffle: When true, the sample order is reshuffled at construction
            and at the end of every epoch.
        **kwargs: Forwarded unchanged to the ``Sequence`` base constructor.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, df, img_size, batch_size, class_list, augmentation=None, shuffle=True, **kwargs):
        # The base class must be initialised; skipping this makes Keras emit
        # its "super().__init__() not called" warning during fit().
        super().__init__(**kwargs)
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.augmentation = augmentation
        self.shuffle = shuffle
        self.class_list = class_list
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch, counting a final short batch."""
        # Ceil instead of floor so the trailing len(df) % batch_size samples
        # are not silently excluded from training and evaluation.
        return int(np.ceil(len(self.df) / self.batch_size))

    def on_epoch_end(self):
        """Rebuild the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.df))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _blank_image(self):
        """Return the all-black float32 placeholder used for unreadable images."""
        return np.zeros((self.img_size, self.img_size, 3), dtype=np.float32)

    def img_read_resize(self, img_path):
        """
        Read an image, convert BGR to RGB, resize it and scale it to [0, 1].

        Best effort by design: a file that is missing or cannot be decoded
        (``cv2.imread`` returns ``None``) or that OpenCV fails to process
        yields a black image instead of an exception, so a partially
        uploaded image folder does not abort training.
        """
        try:
            img = cv2.imread(img_path)
            if img is None:
                return self._blank_image()

            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img_redim = cv2.resize(img_rgb, (self.img_size, self.img_size))
            return img_redim.astype(np.float32) / 255.0
        except cv2.error:
            # Only OpenCV processing failures are absorbed; anything else is
            # a programming error and should surface.
            return self._blank_image()

    def __getitem__(self, index):
        """
        Build batch number ``index``.

        Returns:
            ``(X_batch, y_batch)``: float32 images of shape
            ``(n, img_size, img_size, 3)`` and int32 labels of shape ``(n,)``,
            where ``n`` equals ``batch_size`` except possibly for the last
            batch of the epoch.

        Raises:
            IndexError: If ``index`` is outside ``[0, len(self))``.
        """
        if not 0 <= index < len(self):
            raise IndexError(f"batch index {index} out of range for {len(self)} batches")

        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]
        batch_df = self.df.iloc[batch_indexes]

        # Size the arrays from the actual slice: the last batch can be short,
        # and np.empty would otherwise leave uninitialised rows behind.
        n_samples = len(batch_df)
        X_batch = np.empty((n_samples, self.img_size, self.img_size, 3), dtype=np.float32)
        for i, img_path in enumerate(batch_df['img_path']):
            X_batch[i] = self.img_read_resize(img_path)
        y_batch = batch_df['class_idx'].to_numpy(dtype=np.int32)

        if self.augmentation is not None:
            # Keras layers return tensors; convert back so augmented and
            # non-augmented generators hand out the same array type.
            X_batch = np.asarray(self.augmentation(X_batch), dtype=np.float32)

        return X_batch, y_batch


# --- 3. DATA LOADING & PREPROCESSING (DataFrame only) ---

def load_traindf(csv_path, img_dir, min_images_per_class=None, max_classes=None):
    """
    Load the training CSV and reduce it to a few well-populated classes.

    The CSV must provide an ``id`` column (image id, at least three
    characters long) and a ``landmark_id`` column (integer class label).

    Args:
        csv_path: Path of the CSV file to read.
        img_dir: Root image directory. An id ``abcdef`` maps to
            ``<img_dir>/a/b/c/abcdef.jpg``.
        min_images_per_class: Landmarks with fewer rows than this are
            dropped. ``None`` (default) uses ``MIN_IMAGES_PER_CLASS``.
        max_classes: Keep at most this many landmarks, most frequent first.
            ``None`` (default) uses ``MAX_CLASSES``.

    Returns:
        ``(traindf, class_list)``: the filtered dataframe with the added
        columns ``img_path`` and ``class_idx``, and the sorted list of kept
        landmark ids; ``class_idx`` is a landmark's position in that list.

    Raises:
        SystemExit: If ``csv_path`` does not exist (after printing a message).
    """
    # Resolve the module-level defaults at call time so callers (and tests)
    # can override the limits without touching globals.
    if min_images_per_class is None:
        min_images_per_class = MIN_IMAGES_PER_CLASS
    if max_classes is None:
        max_classes = MAX_CLASSES

    try:
        # Read ids as text: an all-digit id must keep its leading zeros and
        # stay indexable character by character for the path built below.
        traindf = pd.read_csv(csv_path, dtype={'id': str})
    except FileNotFoundError:
        print(f"\nFATAL ERROR: CSV not found at {csv_path}. Did you upload it?")
        sys.exit(1)

    traindf['landmark_id'] = traindf['landmark_id'].astype(np.int32)

    # 1. Filter 1: Remove rare classes
    counts = traindf['landmark_id'].value_counts()
    frequent_landmarks = counts[counts >= min_images_per_class].index
    traindf = traindf[traindf['landmark_id'].isin(frequent_landmarks)]

    # 2. Filter 2: Select only the most frequent max_classes landmarks
    if len(frequent_landmarks) > max_classes:
        top_landmarks = traindf['landmark_id'].value_counts().nlargest(max_classes).index
        traindf = traindf[traindf['landmark_id'].isin(top_landmarks)]

    traindf = traindf.reset_index(drop=True)

    # 3. Construct the full image path: the first three characters of the id
    #    name the three nested sub-folders.
    traindf['img_path'] = traindf['id'].apply(
        lambda x: os.path.join(img_dir, x[0], x[1], x[2], x + '.jpg')
    )

    # 4. Map class IDs to sequential integers (0, 1, 2...)
    class_list = sorted(traindf['landmark_id'].unique().tolist())
    class_to_idx = {cls: idx for idx, cls in enumerate(class_list)}
    traindf['class_idx'] = traindf['landmark_id'].map(class_to_idx)

    print(f"--- Dataset Filtering Applied ---")
    print(f"Minimum images per class: {min_images_per_class}")
    print(f"Maximum classes selected: {max_classes}")
    print(f"Final reduced dataset size: {len(traindf)} images across {len(class_list)} classes.")
    return traindf, class_list


print("--- Starting Data Loading & Filtering ---")
traindf, class_list = load_traindf(CSV_PATH, IMG_DIR)
M_CLASS = len(class_list)

# --- 4. DATA SPLITTING & GENERATOR SETUP ---

# Only dataframe rows are partitioned here; no image is read yet.
split_options = dict(random_state=SEED, shuffle=True)

# Stage 1: hold out 10% of all rows as the test set, stratified by class.
train_df, test_df = train_test_split(
    traindf, test_size=0.10, stratify=traindf['class_idx'], **split_options
)

# Stage 2: take 0.1/0.9 of the remainder as validation, so validation is
# again about 10% of the full dataset.
train_df, val_df = train_test_split(
    train_df, test_size=0.1/0.9, stratify=train_df['class_idx'], **split_options
)

# Give every partition a clean 0..n-1 index for positional batch lookups.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)


print("\n--- Data Split Summary (DataFrames) ---")
print(f"Training Samples: {len(train_df)}")
print(f"Validation Samples: {len(val_df)}")
print(f"Testing Samples: {len(test_df)}")


# --- 5. DATA AUGMENTATION & GENERATOR INSTANTIATION ---

# Random transforms applied to training batches only, all seeded with SEED.
augmentation_layers = [
    layers.RandomFlip("horizontal", seed=SEED),
    layers.RandomZoom(height_factor=(-0.2, 0.2), width_factor=(-0.2, 0.2), seed=SEED),
    layers.RandomTranslation(height_factor=(-0.1, 0.1), width_factor=(-0.1, 0.1), seed=SEED),
    layers.RandomRotation(factor=(-0.1, 0.1), seed=SEED),
    layers.RandomContrast(0.4, seed=SEED),
    layers.RandomCrop(IMG_SIZE, IMG_SIZE, seed=SEED),
]
data_augmentation = tf.keras.Sequential(augmentation_layers, name="data_augmentation")

# Positional arguments shared by every generator.
generator_settings = (IMG_SIZE, BATCH_SIZE, class_list)

# Training data: shuffled and augmented.
train_generator = LandmarkDataGenerator(
    train_df, *generator_settings,
    augmentation=data_augmentation, shuffle=True
)

# Validation and test data: fixed order, no augmentation.
val_generator, test_generator = (
    LandmarkDataGenerator(
        eval_df, *generator_settings,
        augmentation=None, shuffle=False
    )
    for eval_df in (val_df, test_df)
)


# --- 6. MODEL DEFINITION (TRANSFER LEARNING) ---

print("\n--- Defining Model: ResNet50V2 with Custom Head ---")

# ImageNet-pretrained convolutional backbone without its classifier.
base_model = ResNet50V2(
    weights='imagenet',
    include_top=False,
    input_shape=(IMG_SIZE, IMG_SIZE, 3)
)

base_model.trainable = False # Start by freezing the base

# Classification head trained from scratch on top of the frozen features.
model_head = tf.keras.Sequential([
    layers.Flatten(),
    layers.Dense(
        512,
        activation='relu',
        kernel_regularizer=regularizers.l2(WEIGHT_DECAY)
    ),
    layers.Dropout(rate=DROPOUT_RATE, seed=SEED),
    layers.BatchNormalization(),
    layers.Dense(M_CLASS, activation='softmax')
])

# The data generators deliver pixels in [0, 1], whereas the ImageNet weights
# of ResNet50V2 were trained on inputs scaled to [-1, 1]. Mapping x -> 2x - 1
# inside the model keeps the pretrained features meaningful while leaving the
# generators (and anything that displays their batches) untouched.
model = tf.keras.Sequential([
    layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)),
    layers.Rescaling(scale=2.0, offset=-1.0, name="to_resnet_v2_range"),
    base_model,
    model_head
], name="Landmark_Recognizer_ResNet50V2")

model.summary()

# --- 7. OPTIMIZER, COMPILATION, AND CALLBACKS ---

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# Labels are integer class indices, hence the sparse loss and metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
    optimizer=optimizer,
)

# Stop once validation loss has not improved for 5 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    min_delta=0.0001,
    restore_best_weights=True,
)

# Scale the learning rate by 0.7 after 3 epochs without improvement.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    patience=3,
    factor=0.7,
    min_delta=0.0001,
    min_lr=1e-7,
    verbose=1,
)

# Weights-only checkpoints need a filename ending in ".weights.h5".
checkpoint = ModelCheckpoint(
    'best-weights.weights.h5',
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
)

callbacks_list = [early_stopping, reduce_lr, checkpoint]


# --- 8. TRAINING ---
print("\n--- Starting Model Training ---")

# Batches come from the disk-backed generators; nothing is preloaded.
fit_options = dict(
    epochs=EPOCHS,
    validation_data=val_generator,
    callbacks=callbacks_list,
    verbose=1,
)

train_start_time = time.time()
history = model.fit(train_generator, **fit_options)
train_end_time = time.time()

# Wall-clock training time, reported in minutes.
training_duration = (train_end_time - train_start_time) / 60
print(f"\nTraining duration: {training_duration:.2f} minutes")

# --- 9. EVALUATION & RESULTS ---

# Prefer the best checkpoint written during training; without one, the
# weights currently held by the model are evaluated as they are.
best_weights_path = 'best-weights.weights.h5'
if not os.path.exists(best_weights_path):
    print("Warning: Checkpoint file not found. Using final epoch weights.")
else:
    model.load_weights(best_weights_path)

print("\n--- Model Evaluation with Testing Data ---")
test_metrics = model.evaluate(test_generator, verbose=1)
loss, accuracy = test_metrics
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


# 9.1. Plotting Performance
# One row per subplot: position, history keys (train, validation), legend
# labels (train, validation), panel title and y-axis label.
curve_panels = [
    (1, 'loss', 'val_loss',
     'Training Loss', 'Validation Loss',
     'Loss vs. Epochs', 'Loss'),
    (2, 'sparse_categorical_accuracy', 'val_sparse_categorical_accuracy',
     'Training Accuracy', 'Validation Accuracy',
     'Accuracy vs. Epochs', 'Accuracy'),
]

plt.figure(figsize=(12, 4))
for position, train_key, val_key, train_label, val_label, panel_title, y_label in curve_panels:
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
plt.show()

# 9.2. Classification Report (Precision, Recall, F1-Score)
# Get predictions from the test generator
y_pred_probs = model.predict(test_generator)
y_pred = np.argmax(y_pred_probs, axis=1)

# True labels come straight from the generator's dataframe, in the order the
# generator serves its samples. This avoids decoding every test image a
# second time just to read its label. The slice keeps y_true aligned with
# y_pred whether or not the generator serves a final partial batch.
y_true = test_generator.df['class_idx'].to_numpy()[test_generator.indexes][:len(y_pred)]

print("\n--- Classification Report ---")
report_classes = [str(cls) for cls in class_list]
# Passing `labels` pins the report to every known class index; without it
# sklearn raises when a class is missing from both y_true and y_pred, because
# the number of target names no longer matches the labels it finds.
print(classification_report(
    y_true,
    y_pred,
    labels=list(range(len(class_list))),
    target_names=report_classes,
    zero_division=0,
))

# 9.3. Visual Results (Showing sample predictions)
def plot_predictions(generator, model, num_samples=9):
    """
    Show a grid of images from one random batch with true and predicted ids.

    Args:
        generator: Indexable batch source; ``generator[i]`` must return
            ``(images, labels)`` with images scaled to [0, 1].
        model: Model whose ``predict`` returns per-class scores for a batch.
        num_samples: Maximum number of images to show. The grid is sized to
            fit, so values other than 9 work as well.

    Titles are green when the predicted class index equals the true one and
    red otherwise. Nothing is drawn when the generator has no batches or
    there is nothing to show; a short message is printed instead.
    """
    # Bail out before creating a figure so no empty canvas is left behind.
    if len(generator) == 0:
        print("Cannot plot predictions: Test generator is empty.")
        return

    # Get one random batch from the generator
    X_batch, y_true_batch = generator[np.random.randint(0, len(generator))]
    X_batch = np.asarray(X_batch)

    n_shown = min(num_samples, len(X_batch))
    if n_shown <= 0:
        print("Cannot plot predictions: no samples to display.")
        return

    # Predict on the batch
    y_pred_probs = model.predict(X_batch, verbose=0)
    y_pred_batch = np.argmax(y_pred_probs, axis=1)

    # Near-square grid large enough for n_shown panels (3x3 for the default 9).
    n_cols = int(np.ceil(np.sqrt(n_shown)))
    n_rows = int(np.ceil(n_shown / n_cols))

    plt.figure(figsize=(15, 15))
    for i in range(n_shown):
        true_label_idx = int(y_true_batch[i])
        predicted_label_idx = int(y_pred_batch[i])

        # Translate indices to landmark ids only when both are valid
        # positions in class_list.
        indices_valid = (
            0 <= true_label_idx < len(class_list)
            and 0 <= predicted_label_idx < len(class_list)
        )
        if indices_valid:
            true_id = class_list[true_label_idx]
            predicted_id = class_list[predicted_label_idx]
        else:
            true_id = "N/A"
            predicted_id = "N/A"

        # Compare indices, not display ids: two "N/A" ids are not a match.
        color = 'green' if true_label_idx == predicted_label_idx else 'red'

        plt.subplot(n_rows, n_cols, i + 1)
        # Denormalize for display; clip first so out-of-range values do not
        # wrap around in the uint8 cast.
        plt.imshow((np.clip(X_batch[i], 0.0, 1.0) * 255).astype(np.uint8))
        plt.title(f'True ID: {true_id}\nPred ID: {predicted_id}', color=color, fontsize=10)
        plt.xticks([]); plt.yticks([])

    plt.suptitle('Sample Predictions on Test Data', fontsize=18, y=0.92)
    plt.show()

# Show up to nine predictions from one randomly chosen test batch, then
# report that the script reached its end.
print("\n--- Visualizing Sample Predictions ---")
plot_predictions(test_generator, model, num_samples=9)

print("\nProject execution finished successfully.")

--- Colab Setup: Data Upload Instructions ---

!!! CRITICAL ACTION REQUIRED: UPLOAD DATA !!!
To run this successfully, you must manually upload the following files:
1. **Upload `train.csv`**: Upload the file directly to the `/content` folder using the Colab file panel (left sidebar).
2. **Upload Images**: Create a subfolder named `train` inside `/content` and upload a very small subset (around 100MB) of images into it. You MUST preserve the original nested subfolder structure (e.g., /content/train/0/0/0/...) for the image paths to work.
--------------------------------------------------------------------------

File train.csv found. Proceeding with data loading and filtering.

--- Setup Check Complete ---
--- Starting Data Loading & Filtering ---
--- Dataset Filtering Applied ---
Minimum images per class: 10
Maximum classes selected: 5
Final reduced dataset size: 13137 images across 5 classes.

--- Data Split Summary (DataFrames) ---
Training Samples: 10509
Validation Samples: 1314
Testing Samples: 1314


--- Starting Model Training ---


  self._warn_if_super_not_called()


Epoch 1/15
 85/328 ━━━━━━━━━━━━━━━━━━━━ 6:29 2s/step - loss: 1.9220 - sparse_categorical_accuracy: 0.2067