# Environment Setup

In [1]:
import os
from dotenv import load_dotenv

# Loading environment variables (including WORKING_DIRECTORY) from .env
load_dotenv()

# Changing directory to main directory for easy data access
working_directory = os.getenv("WORKING_DIRECTORY")
if not working_directory:
    # os.chdir(None) would fail with an opaque TypeError, so fail early with
    # a message that tells the user what to fix
    raise RuntimeError(
        "WORKING_DIRECTORY is not set. Define it in the .env file or in the environment."
    )
os.chdir(working_directory)

# Checking the change
%pwd

'/workspaces/TumorTracer'

In [2]:
from pathlib import Path

# Sanity check: after the chdir above, the current directory should contain
# the ".git" folder (relative path, resolved against the working directory)
print("Git folder exists:", Path(".git").exists())

Git folder exists: True


# 3. Model Training

In [3]:
from dataclasses import dataclass
from pathlib import Path
from cnnClassifier import get_logger
from typing import Optional, Dict, Any

# Initializing the module-level logger via the cnnClassifier package's
# get_logger() helper (its configuration is defined outside this notebook)
logger = get_logger()

@dataclass(frozen=True)
class ModelTrainingConfig:
    """
    Frozen (read-only) bundle of every path and hyperparameter
    consumed by the model training stage.

    Attributes:
    - root_dir: Directory for training artifacts.
    - trained_model_path: Final model output path.
    - updated_base_model: Pretrained model with custom head.
    - training_data: Directory with training images.
    - validation_data: Directory with validation images.
    - params_augmentation: Whether to apply augmentation.
    - params_checkpoint: Whether created models need to be checkpointed.
    - params_image_size: Input image size, e.g., (224, 224, 3).
    - params_batch_size: Batch size for training.
    - params_epochs: Total epochs.
    - params_optimizer: Optimizer to be used when recompiling the model.
    - params_learning_rate: Learning rate for training.
    - params_if_augmentation: Dict of augmentation hyperparameters (defaults to None).
    """
    # --- Paths ---
    root_dir: Path
    trained_model_path: Path
    updated_base_model: Path
    training_data: Path
    validation_data: Path

    # --- Feature switches ---
    params_augmentation: bool
    params_checkpoint: bool

    # --- Hyperparameters ---
    params_image_size: tuple[int, int, int]
    params_batch_size: int
    params_epochs: int
    params_optimizer: str
    params_learning_rate: float

    # --- Optional augmentation settings ---
    params_if_augmentation: Optional[Dict[str, Any]] = None

In [4]:
from cnnClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from cnnClassifier.utils.common import read_yaml, create_directories
from cnnClassifier import get_logger

# Initializing the module-level logger used by ConfigurationManager below
# (re-binds the same `logger` name created in the previous cell)
logger = get_logger()

class ConfigurationManager:
    """
    Loads config.yaml and params.yaml and turns their contents into
    structured configuration objects for the pipeline stages.
    """
    def __init__(self, config_file_path=CONFIG_FILE_PATH, params_file_path=PARAMS_FILE_PATH) -> None:
        """
        Reads configuration files (config.yaml and params.yaml), 
        ensures necessary directories exist, and prepares structured config objects.

        Args:
        - config_file_path (str): Path to the config.yaml file.
        - params_file_path (str): Path to the params.yaml file.

        Raises:
        - FileNotFoundError: If either of the two files does not exist.
        """
        # Validate and load config.yaml
        if not Path(config_file_path).exists():
            logger.error(f"Config file not found at: {config_file_path}")
            raise FileNotFoundError(f"Config file not found at: {config_file_path}")
        self.config = read_yaml(config_file_path)

        # Validate and load params.yaml
        # (the check must target the params path itself, not the config path)
        if not Path(params_file_path).exists():
            logger.error(f"Params file not found at: {params_file_path}")
            raise FileNotFoundError(f"Params file not found at: {params_file_path}")
        self.params = read_yaml(params_file_path)

        logger.info(f"Loading configuration from {config_file_path} and parameters from {params_file_path}")

        # Create the root artifacts directory (if not already present)
        create_directories([self.config.artifacts_root])


    def get_training_config(self) -> ModelTrainingConfig:
        """
        Prepares and returns the ModelTrainingConfig object.

        Reads the `model_training` section of both the config and the params
        files and creates the stage's root directory.

        Returns:
        - ModelTrainingConfig: Structured config for training the updated base model.
        """
        stage_paths = self.config.model_training
        stage_params = self.params.model_training

        # Ensure the model training root directory exists
        create_directories([stage_paths.root_dir])

        # Load augmentation params only if augmentation is enabled and params for it are present.
        # A key that exists but is empty (None) is treated the same as a missing key,
        # since dict(None) would raise a TypeError.
        params_for_augmentation = {}
        raw_augmentation_params = getattr(stage_params, "AUGMENTATION_PARAMS", None)
        if stage_params.AUGMENTATION and raw_augmentation_params:
            params_for_augmentation = dict(raw_augmentation_params)

        training_config = ModelTrainingConfig(
            root_dir=Path(stage_paths.root_dir),
            trained_model_path=Path(stage_paths.trained_model_path),
            updated_base_model=Path(stage_paths.updated_model_path),
            training_data=Path(stage_paths.training_dataset),
            validation_data=Path(stage_paths.validation_dataset),
            params_augmentation=stage_params.AUGMENTATION,
            params_checkpoint=stage_params.CHECKPOINT,
            params_image_size=tuple(stage_params.IMAGE_SIZE),
            params_batch_size=stage_params.BATCH_SIZE,
            params_epochs=stage_params.EPOCHS,
            params_optimizer=stage_params.OPTIMIZER,
            params_learning_rate=stage_params.LEARNING_RATE,
            params_if_augmentation=params_for_augmentation,
        )

        logger.info(f"ModelTrainingConfig created with: {training_config}")

        return training_config

In [None]:
import os
import random
import numpy as np
import tensorflow as tf
from math import ceil
from typing import Union
from tensorflow.keras.preprocessing.image import ImageDataGenerator, DirectoryIterator
from pathlib import Path
from dataclasses import asdict
from datetime import datetime

from cnnClassifier.utils.common import create_directories, save_json
from cnnClassifier import get_logger

# Initializing the module-level logger used by the ModelTraining class below
# (re-binds the same `logger` name created in the earlier cells)
logger = get_logger()

class ModelTraining:
    """
    Initializes training pipeline with given configuration.

    Core Responsibilities:
    - Load a pre-defined base model from disk and recompile it with a fresh optimizer.
    - Set up data generators for training and validation, with optional augmentation.
    - Train the model across multiple epochs with optional checkpointing.
    - Resume training from where it left off.
    - Save class label mappings and model artifacts.

    Public Methods:
    - get_base_model(): Load and compile the pre-trained base model.
    - get_data_generators(): Prepare train and validation data generators.
    - train(): Train the model with checkpointing on best validation accuracy.
    - resume_train(add_epochs): Continue training the model for additional epochs.
    - save_class_indices(): Save class-to-index mapping as JSON for reproducibility.

    Private Utilities:
    - _build_generator(): Helper to construct data generators with standard settings.
    - _create_checkpoint(): Creates a checkpoint directory and stores training metadata.
    - _get_optimizer(): Returns optimizer based on config.
    - _count_images_in_directory(): Utility to count image files recursively.
    - _convert_paths_to_str(): Makes a config dict JSON-serializable by stringifying Paths.
    - _save_model(): Saves the model to disk at the given path.
    """
    def __init__(self, config: ModelTrainingConfig) -> None:
        """
        Initializes the model training pipeline.

        - Sets random seeds for reproducibility.
        - Prepares internal attributes for managing training, checkpoints, and model state.

        Args:
        - config (ModelTrainingConfig): Paths and hyperparameters for this training run.
        """
        # Store configuration
        self.config = config

        # Set random seeds (falls back to 1234 when the config has no `params_seed`)
        seed = getattr(self.config, "params_seed", 1234)
        tf.random.set_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        # Initialize model and training attributes
        self.output_model = None
        self.training_generator = None
        self.valid_generator = None
        self.training_images = None
        self.validation_images = None
        self.last_epoch = 0            # Number of epochs completed so far
        self.additional_epochs = 0     # Extra epochs requested through resume_train()
        self.best_val_accuracy = 0

        # Initialize checkpoint directory path with timestamp (minute resolution)
        curr_time = datetime.now().strftime("%Y%m%d_%H%M")
        self.checkpoint_path = Path(self.config.root_dir) / f"Checkpoint_{curr_time}"


    def get_base_model(self) -> None:
        """
        Loads the base model from the configured path and recompiles it.

        Raises:
        - FileNotFoundError: If no model file exists at `config.updated_base_model`.
        """
        model_path = Path(self.config.updated_base_model)

        if not model_path.exists():
            logger.error(f"Could not find model at {model_path}. Run the Base Model pipeline stage first.")
            raise FileNotFoundError(f"Could not find model at {model_path}. Run the Base Model pipeline stage first.")

        try:
            self.output_model = tf.keras.models.load_model(model_path)
            logger.info(f"Successfully loaded the base model from {model_path}.")

            # Enabling Eager Execution (optional in TF 2.x) due to library requirement
            tf.config.run_functions_eagerly(True)

            # Recompile model with a fresh optimizer (required after loading)
            self.output_model.compile(
                optimizer=self._get_optimizer(),
                loss=tf.keras.losses.CategoricalCrossentropy(),
                metrics=["accuracy"]
            )
            logger.info("Successfully recomplied the model.")

        except Exception as exception_error:
            logger.error(f"Unexpected error while loading the update base model at {model_path}: {exception_error}")
            raise


    def get_data_generators(self) -> None:
        """
        Update train and validation data generators using ImageDataGenerator.
        Applies augmentation only on training data if enabled.

        Raises:
        - ValueError: If the class-to-index mappings of the two generators differ.
        """
        try:
            logger.info("Preparing ImageDataGenerators...")

            # Augmentation kwargs are used only when augmentation is enabled; a missing
            # (None) or empty dict means "no augmentation" instead of failing on **None
            augmentation_kwargs = {}
            if self.config.params_augmentation and self.config.params_if_augmentation:
                augmentation_kwargs = dict(self.config.params_if_augmentation)

            train_datagen = ImageDataGenerator(rescale=1.0/255, **augmentation_kwargs)
            valid_datagen = ImageDataGenerator(rescale=1.0/255)

            self.training_generator = self._build_generator(train_datagen, self.config.training_data, "Train")
            self.valid_generator = self._build_generator(valid_datagen, self.config.validation_data, "Valid")

            # Ensure class-to-index mapping is consistent
            if self.training_generator.class_indices != self.valid_generator.class_indices:
                logger.error("Mismatch in class indices between train and validation generators!")
                raise ValueError("Mismatch in class indices between train and validation generators!")

            logger.info("ImageDataGenerators created successfully.")

        except Exception as exception_error:
            logger.error(f"Unexpected error while creating data generators: {exception_error}")
            raise


    def train(self) -> None:
        """
        Trains the model using prepared generators.

        Runs one epoch per `fit` call, from the number of epochs already completed
        (`self.last_epoch`) up to `params_epochs + additional_epochs`, so that a
        later call (e.g. through resume_train) continues where the previous one
        stopped. When checkpointing is enabled, the model is saved every time the
        validation accuracy improves. The final model is always saved to
        `config.trained_model_path`.

        Raises:
        - ValueError: If the base model or the data generators were not prepared.
        """
        if self.output_model is None:
            logger.error("Base model not found. Run get_base_model() before calling train().")
            raise ValueError("Base model not found. Run get_base_model() before calling train().")

        if self.training_generator is None or self.valid_generator is None:
            logger.error("Data generators not found. Run get_data_generators() before calling train().")
            raise ValueError("Data generators not found. Run get_data_generators() before calling train().")

        if (self.config.params_checkpoint) and (not self.checkpoint_path.exists()) and (self.config.params_epochs >= 1):
            self._create_checkpoint()

        try:
            logger.info("Initializing model training...")

            # Counting the images in each of the datasets
            self.training_images = self._count_images_in_directory(self.config.training_data)
            self.validation_images = self._count_images_in_directory(self.config.validation_data)

            # These values do not change between epochs, so compute them once
            total_epochs = self.config.params_epochs + self.additional_epochs
            steps_per_epoch = ceil(self.training_images / self.config.params_batch_size)
            validation_steps = ceil(self.validation_images / self.config.params_batch_size)

            # Fitting the model.
            # `epoch` is the zero-based index of the epoch about to run; starting at
            # `self.last_epoch` (the count of completed epochs) means no epoch is
            # skipped on the first run or when training is resumed.
            for epoch in range(self.last_epoch, total_epochs):
                history = self.output_model.fit(
                    self.training_generator,
                    validation_data=self.valid_generator,
                    initial_epoch=epoch,         # Sets starting point for correct logging
                    epochs=epoch+1,              # Only running 1 epoch at a time
                    steps_per_epoch=steps_per_epoch,
                    validation_steps=validation_steps,
                )

                # Updating number of epochs completed
                self.last_epoch = epoch + 1

                if self.config.params_checkpoint:
                    # Accessing accuracy scores (single-epoch history, hence index 0)
                    train_acc = history.history.get("accuracy", [0])[0]
                    val_acc = history.history.get("val_accuracy", [0])[0]

                    # If current model is better than prior best model, saving the model
                    if val_acc > self.best_val_accuracy:
                        self.best_val_accuracy = val_acc
                        model_path = self.checkpoint_path / f"model_e{epoch+1:02d}_acc{train_acc:.4f}_vacc{val_acc:.4f}.h5"
                        self._save_model(save_path=model_path, model=self.output_model)
                        logger.info(f"Saved new best model at {model_path}")

            logger.info("Successfully trained model based on provided parameters.")
            self._save_model(save_path=self.config.trained_model_path, model=self.output_model)

        except Exception as exception_error:
            logger.error(f"Unexpected error while training the model: {exception_error}")
            raise


    def resume_train(self, add_epochs: int) -> None:
        """
        Resumes model training for additional number of epochs.

        Args:
        - add_epochs (int): Number of epochs to run on top of those already scheduled.

        Raises:
        - ValueError: If `add_epochs` is negative.
        """
        try:
            if add_epochs < 0:
                raise ValueError(f"add_epochs must be non-negative, got {add_epochs}")

            if self.additional_epochs is None:
                self.additional_epochs = 0
            self.additional_epochs += add_epochs

            self.train()

        except Exception as exception_error:
            logger.error(f"Unexpected error while resuming training: {exception_error}")
            raise


    def save_class_indices(self) -> None:
        """
        Saves the class index mapping as a JSON file for future reference.

        The mapping is written to the training root directory and, when
        checkpointing is enabled and the checkpoint directory exists, to that
        directory as well.

        Raises:
        - ValueError: If the data generators were not prepared.
        """
        if self.training_generator is None:
            logger.error("Class indices not found. Run get_data_generators() before calling save_class_indices().")
            raise ValueError("Class indices not found. Run get_data_generators() before calling save_class_indices().")

        try:
            save_path = Path(self.config.root_dir) / "class_indices.json"
            save_json(save_path=save_path, data=self.training_generator.class_indices)

            if self.config.params_checkpoint and self.checkpoint_path.exists():
                checkpoint_save_path = self.checkpoint_path / "class_indices.json"
                save_json(save_path=checkpoint_save_path, data=self.training_generator.class_indices)

        except Exception as exception_error:
            logger.error(f"Unexpected error while saving class indices: {exception_error}")
            raise


    def _build_generator(self, datagen: ImageDataGenerator, data_path: Union[str, Path], tag: str) -> DirectoryIterator:
        """
        Helper to build a flow_from_directory generator with consistent options.

        Args:
        - datagen (ImageDataGenerator): Instance of the ImageDataGenerator.
        - data_path (Union[str, Path]): Path to the directory containing images.
        - tag (str): Label for logging context ("Train" or "Valid").

        Returns:
        - DirectoryIterator: Configured Keras generator for the given directory.

        Raises:
        - FileNotFoundError: If `data_path` does not exist.
        """
        try:
            data_path = Path(data_path)

            if not data_path.exists():
                logger.error(f"{tag.title()} directory not found: {data_path}")
                raise FileNotFoundError(f"{tag.title()} directory not found: {data_path}")

            # Building generator (only height and width of the image size are passed)
            generator_unit = datagen.flow_from_directory(
                directory=data_path,
                target_size=self.config.params_image_size[:2],
                batch_size=self.config.params_batch_size,
                class_mode="categorical",
                shuffle=True,
            )

            return generator_unit

        except Exception as exception_error:
            logger.error(f"Unexpected error while build generator: {exception_error}")
            raise


    def _create_checkpoint(self) -> None:
        """
        Creates a checkpoint directory and saves the training configuration as a JSON file.

        Purpose:
        - Ensures the checkpoint directory exists.
        - Saves the current training configuration (hyperparameters) used in that run.
        - Helps with reproducibility and traceability for saved models.
        """
        try:
            logger.info("Creating checkpoint directory...")

            save_path = self.checkpoint_path / "params_used.json"
            create_directories([self.checkpoint_path])

            # Convert all Path objects to str recursively (Path is not JSON serializable)
            config_dict = self._convert_paths_to_str(asdict(self.config))

            save_json(save_path=save_path, data=config_dict)

            logger.info("Checkpoint directory created.")

        except Exception as exception_error:
            logger.error(f"Unexpected error while creating checkpoint directroy: {exception_error}")
            raise


    def _get_optimizer(self) -> tf.keras.optimizers.Optimizer:
        """
        Dynamically selects and returns a TensorFlow optimizer based on the configuration.

        Supports "SGD", "RMSPROP" and "ADAM" (case-insensitive); any other name
        falls back to Adam.

        Returns:
            tf.keras.optimizers.Optimizer: Configured optimizer instance for model compilation.
        """
        try:
            # Normalize optimizer name to uppercase for consistent matching
            optimizer_name = self.config.params_optimizer.strip().upper()
            learning_rate = self.config.params_learning_rate

            # Select optimizer based on configuration
            if optimizer_name == "SGD":
                optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

            elif optimizer_name == "RMSPROP":
                optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)

            else:
                # Default to Adam if unsupported optimizer name is provided
                if optimizer_name != "ADAM":
                    logger.warning(f"Unsupported optimizer name {optimizer_name} provided. Falling back to 'Adam'.")
                    optimizer_name = "ADAM"

                optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

            logger.info(f"Optimizer '{optimizer_name}' initialized and returned.")
            return optimizer

        except Exception as exception_error:
            logger.error(f"Unexpected error while loading optimizer: {exception_error}")
            raise


    @staticmethod
    def _count_images_in_directory(directory_path: Union[str, Path]) -> int:
        """
        Counts the total number of image files in a directory and its subfolders.

        Only files ending in .png, .jpg or .jpeg (case-insensitive) are counted.

        Args:
        - directory_path (str or Path): Path to the dataset root (e.g., train or valid)

        Returns:
        - int: Total number of images found

        Raises:
        - FileNotFoundError: If the directory does not exist.
        - ValueError: If no image file is found.
        """
        try:
            directory_path = Path(directory_path)

            if not directory_path.exists():
                logger.error(f"Could not find the path {directory_path}")
                raise FileNotFoundError(f"Could not find the path {directory_path}")

            image_suffixes = (".png", ".jpg", ".jpeg")
            total_images = sum(
                1
                for _, _, files in os.walk(directory_path)
                for file_name in files
                if file_name.lower().endswith(image_suffixes)
            )

            if total_images == 0:
                logger.error(f"No images found in {directory_path}")
                raise ValueError(f"No images found in {directory_path}")

            return total_images

        except Exception as exception_error:
            logger.error(f"Unexpected error while counting images in directory: {exception_error}")
            raise


    @staticmethod
    def _convert_paths_to_str(obj: dict) -> dict:
        """
        Recursively convert Path objects in a nested dictionary to strings.

        Paths nested inside dicts, lists and tuples are converted too; every
        other value is returned unchanged.

        Args:
        - obj (dict): Dictionary that may contain Path objects at any depth.

        Returns:
        - dict: New dictionary with every Path replaced by its string form.
        """
        def _convert(value):
            if isinstance(value, Path):
                return str(value)
            if isinstance(value, dict):
                return {key: _convert(item) for key, item in value.items()}
            if isinstance(value, (list, tuple)):
                converted = [_convert(item) for item in value]
                # Keep the container kind so non-Path data compares equal to the input
                return tuple(converted) if isinstance(value, tuple) else converted
            return value

        return {key: _convert(value) for key, value in obj.items()}


    @staticmethod
    def _save_model(save_path: Path, model: tf.keras.Model) -> None:
        """
        Saves a given model to the specified path.

        The parent directory of `save_path` is created first if needed.

        Args:
        - save_path (Path): Destination file for the model.
        - model (tf.keras.Model): Model to be saved.
        """
        try:
            save_path = Path(save_path)
            create_directories([save_path.parent])
            model.save(save_path)
            logger.info(f"Model saved at: {save_path}")

        except Exception as exception_error:
            logger.error(f"Unexpected error while saving the model at {save_path}: {exception_error}")
            raise

2025-07-05 14:21:18.696684: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-05 14:21:18.896073: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-05 14:21:19.969284: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# End-to-end run of the model training stage using the classes defined above.
# The call order matters: the model and the generators must exist before train().
try:
    # Read config.yaml / params.yaml and build the immutable training config
    config_manager = ConfigurationManager()
    training_config = config_manager.get_training_config()

    # Load + recompile the base model, prepare the data generators, then train
    training_constructor = ModelTraining(config=training_config)
    training_constructor.get_base_model()
    training_constructor.get_data_generators()
    training_constructor.train()
    # Persist the class-to-index mapping produced by the training generator
    training_constructor.save_class_indices()
    # Request one more epoch on top of the configured number of epochs
    training_constructor.resume_train(add_epochs=1)

except Exception as exception_error:
    # logger.exception records the traceback; the error is re-raised so the cell still fails
    logger.exception(f"Unexpected error during model training pipeline: {exception_error}")
    raise

[2025-07-05 14:21:25,388: INFO: common: YAML file: config/config.yaml loaded successfully]
[2025-07-05 14:21:25,397: INFO: common: YAML file: params/params.yaml loaded successfully]
[2025-07-05 14:21:25,398: INFO: 377247663: Loading configuration from config/config.yaml and parameters from params/params.yaml]
[2025-07-05 14:21:25,402: INFO: common: Directory: artifacts created successfully.]
[2025-07-05 14:21:25,403: INFO: common: Directory: artifacts/model_training created successfully.]
[2025-07-05 14:21:25,404: INFO: 377247663: ModelTrainingConfig created with: ModelTrainingConfig(root_dir=PosixPath('artifacts/model_training'), trained_model_path=PosixPath('artifacts/model_training/trained_model.h5'), updated_base_model=PosixPath('artifacts/base_model/updated_base_model.h5'), training_data=PosixPath('artifacts/data_ingestion/Data/train'), validation_data=PosixPath('artifacts/data_ingestion/Data/valid'), params_augmentation=True, params_checkpoint=True, params_image_size=(224, 224, 3



[2025-07-05 14:21:25,929: INFO: 828250690: Successfully loaded the base model from artifacts/base_model/updated_base_model.h5.]


INFO:cnnClassifierLogger_running:Successfully loaded the base model from artifacts/base_model/updated_base_model.h5.


[2025-07-05 14:21:25,933: INFO: 828250690: Optimizer 'ADAM' initialized and returned.]


INFO:cnnClassifierLogger_running:Optimizer 'ADAM' initialized and returned.


[2025-07-05 14:21:25,936: INFO: 828250690: Successfully recomplied the model.]


INFO:cnnClassifierLogger_running:Successfully recomplied the model.


[2025-07-05 14:21:25,937: INFO: 828250690: Preparing ImageDataGenerators...]


INFO:cnnClassifierLogger_running:Preparing ImageDataGenerators...


Found 613 images belonging to 4 classes.
Found 72 images belonging to 4 classes.
[2025-07-05 14:21:25,962: INFO: 828250690: ImageDataGenerators created successfully.]


INFO:cnnClassifierLogger_running:ImageDataGenerators created successfully.


[2025-07-05 14:21:25,964: INFO: 828250690: Creating checkpoint directory...]


INFO:cnnClassifierLogger_running:Creating checkpoint directory...


[2025-07-05 14:21:25,966: INFO: common: Directory: artifacts/model_training/Checkpoint_20250705_1421 created successfully.]


INFO:cnnClassifierLogger_test:Directory: artifacts/model_training/Checkpoint_20250705_1421 created successfully.


[2025-07-05 14:21:25,968: INFO: common: Directory: artifacts/model_training/Checkpoint_20250705_1421 created successfully.]


INFO:cnnClassifierLogger_test:Directory: artifacts/model_training/Checkpoint_20250705_1421 created successfully.


[2025-07-05 14:21:25,970: ERROR: common: Failed to seialize data to JSON at artifacts/model_training/Checkpoint_20250705_1421/params_used.json: Object of type PosixPath is not JSON serializable]


ERROR:cnnClassifierLogger_test:Failed to seialize data to JSON at artifacts/model_training/Checkpoint_20250705_1421/params_used.json: Object of type PosixPath is not JSON serializable


[2025-07-05 14:21:25,971: ERROR: 828250690: Unexpected error while creating checkpoint directroy: Object of type PosixPath is not JSON serializable]


ERROR:cnnClassifierLogger_running:Unexpected error while creating checkpoint directroy: Object of type PosixPath is not JSON serializable


[2025-07-05 14:21:25,972: ERROR: 1222839666: Unexpected error during model training pipeline: Object of type PosixPath is not JSON serializable]
Traceback (most recent call last):
  File "/tmp/ipykernel_28161/1222839666.py", line 8, in <module>
    training_constructor.train()
  File "/tmp/ipykernel_28161/828250690.py", line 139, in train
    self._create_checkpoint()
  File "/tmp/ipykernel_28161/828250690.py", line 268, in _create_checkpoint
    save_json(save_path=save_path, data=asdict(self.config))
  File "/workspaces/TumorTracer/src/cnnClassifier/utils/common.py", line 134, in save_json
    raise exception_error
  File "/workspaces/TumorTracer/src/cnnClassifier/utils/common.py", line 127, in save_json
    json.dump(data, file, indent=4)
  File "/home/codespace/.python/current/lib/python3.12/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/codespace/.python/current/lib/python3.12/json/encoder.py", line 432, in _iterencode
    yield from _iterencode_dict

ERROR:cnnClassifierLogger_running:Unexpected error during model training pipeline: Object of type PosixPath is not JSON serializable
Traceback (most recent call last):
  File "/tmp/ipykernel_28161/1222839666.py", line 8, in <module>
    training_constructor.train()
  File "/tmp/ipykernel_28161/828250690.py", line 139, in train
    self._create_checkpoint()
  File "/tmp/ipykernel_28161/828250690.py", line 268, in _create_checkpoint
    save_json(save_path=save_path, data=asdict(self.config))
  File "/workspaces/TumorTracer/src/cnnClassifier/utils/common.py", line 134, in save_json
    raise exception_error
  File "/workspaces/TumorTracer/src/cnnClassifier/utils/common.py", line 127, in save_json
    json.dump(data, file, indent=4)
  File "/home/codespace/.python/current/lib/python3.12/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/codespace/.python/current/lib/python3.12/json/encoder.py", line 432, in _iterencode
    yield from _iterencode_dict(o, _current

TypeError: Object of type PosixPath is not JSON serializable