# Training

In [1]:
import torch
from ultralytics import YOLO
from ultralytics.models.yolo.multi import MultiTaskTrainer
import datetime, os, glob

## ---- Experiment sweep --------------------------------------------------
## The training cell loops over every combination of the four lists below.
## The "choices" comment above each list shows the other values it may hold.
model_name = "mtyolov8"  # prefix of the model config file name

# choices: 'pose', 'segment', 'multitask'
list_task = ["multitask"]

# choices: '', '_ECA'
list_model_type = ["_ECA"]

# choices: '', '_pretrained'
list_pretrained = [""]

# choices: 'coco', 'cattleeyeview'
list_dataset = ["coco"]

## ---- Paths ---------------------------------------------------------------
dir_mtYOLO_root = "C:/Users/nikhi/Desktop/mtYOLO"  # holds config/model and config/dataset

## ---- Training hyper-parameters ------------------------------------------
epochs = 2
patience = 0  # forwarded as the early-stopping patience
device = [0]  # GPU index list handed to the trainer
image_size = 640
batch_size = 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


def time_now():
    """Return the current local date and time as text.

    The string has the form ``YYYY-MM-DD HH:MM:SS.ffffff``; the fractional
    part is left out when the microsecond field is zero.
    """
    current = datetime.datetime.now()
    # Same text that str() produces for a datetime.
    return current.isoformat(sep=" ")
    
def yolo_train(task,
               model_type,
               dir_mtYOLO_root,
               dataset,
               pretrained='',
               loss_type='',
               device=None,
               epochs=1,
               patience=0,
               image_size=640,
               batch_size=-1,
               model_name='mtyolov8'
               ):
    """Train one YOLOv8 variant and store the run in ``<parent of root>/logs``.

    The function builds the model and dataset config paths from its
    arguments, prints whether each file exists, assembles the training
    overrides and starts training. ``task == 'multitask'`` goes through
    ``MultiTaskTrainer``; every other task goes through ``YOLO(...).train``.

    Args:
        task: Task name, e.g. 'pose', 'segment' or 'multitask'. Used in both
            config file names and to pick the training entry point.
        model_type: Suffix appended to the model config file name
            (e.g. '' or '_ECA').
        dir_mtYOLO_root: Project root containing ``config/model`` and
            ``config/dataset``. Logs are written to a ``logs`` directory
            beside this root, not inside it.
        dataset: Dataset name used in both config file names.
        pretrained: Empty string to train from the YAML definition only.
            Any other value loads ``yolov8n-<task>.pt`` weights first
            (single-task models only) and is appended to the run name.
        loss_type: Accepted for call compatibility; not used by this function.
        device: Device specification forwarded to the trainer. ``None``
            (the default) means ``[0]``.
        epochs: Number of training epochs.
        patience: Early-stopping patience forwarded to the trainer.
        image_size: Training image size (``imgsz``).
        batch_size: Batch size; ``-1`` requests AutoBatch.
        model_name: Prefix of the model config file name.

    Returns:
        None. Progress is printed and results are written by the trainer.
    """
    ## A literal list default would be one object shared by every call
    if device is None:
        device = [0]

    ## Check if config files exist (reported only; training is still attempted)
    dir_model = f"{dir_mtYOLO_root}/config/model/{model_name}_{task}_{dataset}{model_type}.yaml"
    dir_data = f"{dir_mtYOLO_root}/config/dataset/{dataset}_{task}.yaml"
    dir_log = f"{os.path.dirname(dir_mtYOLO_root)}/logs"
    for config_path in (dir_model, dir_data):
        print(f"{config_path} exists: {os.path.exists(config_path)}")

    ## Naming configuration
    ## The config file stem already ends with `model_type`, so it must not be appended again.
    model_name = dir_model.split('/')[-1].split('.')[0]
    ## First 13 characters of the timestamp = date plus hour ("YYYY-MM-DD HH")
    experiment_name = f'{time_now()[:13]}_{model_name}{pretrained}'
    run_label = f'{model_name}{pretrained}'

    args = dict(
        ## --- model, data and output location ---
        model=dir_model,  # .yaml model definition (a .pt checkpoint is also accepted)
        data=dir_data,  # dataset configuration file
        project=dir_log,  # directory that collects all runs
        name=experiment_name,  # sub-directory of `project` for this run
        exist_ok=True,  # allow reusing/overwriting an existing project/name directory

        ## --- core training settings ---
        imgsz=image_size,  # images are resized to this size before training
        batch=batch_size,  # -1 = AutoBatch (sized from available GPU memory)
        epochs=epochs,  # full passes over the dataset
        cache=True,  # cache dataset images in memory (True/ram), on disk ('disk') or not at all (False)
        # fraction=1,  # fraction of the dataset to train on
        # seed=0,  # random seed for reproducibility
        # deterministic=False,  # force deterministic algorithms
        # pretrained=True,  # bool, or path to weights to start from
        # resume=False,  # resume from the last saved checkpoint
        # freeze=None,  # freeze the first N layers / listed layer indices
        # time=None,  # maximum training time in hours (overrides epochs)
        patience=patience,  # epochs without validation improvement before early stopping
        verbose=False,  # detailed logging during training

        ## --- hardware ---
        device=device,  # 0 / [0, 1] / 'cpu' / 'mps'
        workers=32,  # data-loading worker threads (per RANK for multi-GPU); documented default is 8

        ## --- optimizer ---
        optimizer='auto',  # or SGD, Adam, AdamW, NAdam, RAdam, RMSProp, ...
        # lr0=0.01,  # initial learning rate (SGD=1E-2, Adam=1E-3)
        # warmup_epochs=3,  # learning-rate warmup epochs
        # warmup_momentum=0.8,  # initial momentum during warmup
        # warmup_bias_lr=0.1,  # bias learning rate during warmup
        # lrf=0.01,  # final learning rate = lr0 * lrf
        # cos_lr=False,  # cosine learning-rate schedule
        # momentum=0.937,  # SGD momentum / Adam beta1
        # weight_decay=0.0005,  # L2 regularization

        ## --- regularization and batching ---
        # mask_ratio=0,  # (default 4) downsample ratio for segmentation masks
        dropout=0,  # dropout rate (classification tasks)
        # single_cls=False,  # treat all classes as one class
        rect=False,  # rectangular training (less padding, may affect accuracy)

        ## --- augmentation ---
        close_mosaic=0,  # (default 10) disable mosaic for the last N epochs; 0 turns this feature off
        # hsv_h=0.015,  # 0.0-1.0 hue shift
        # hsv_s=0.7,  # 0.0-1.0 saturation shift
        # hsv_v=0.4,  # 0.0-1.0 brightness shift
        # degrees=0.0,  # -180-+180 random rotation
        # translate=0.0,  # 0.0-1.0 random translation
        # scale=0.0,  # >=0.0 scale gain
        # shear=0.0,  # -180-+180 shear
        # perspective=0.0,  # 0.0-0.001 random perspective
        # flipud=0.0,  # 0.0-1.0 vertical flip probability
        # fliplr=0.5,  # 0.0-1.0 horizontal flip probability
        mosaic=0.0,  # 0.0-1.0 probability of combining four images into one
        # mixup=0.0,  # 0.0-1.0 blend two images and their labels
        # copy_paste=0.0,  # 0.0-1.0 copy objects between images
        # auto_augment='randaugment',  # classification policy (randaugment, autoaugment, augmix)
        # erasing=0.4,  # 0.0-1.0 random erasing (classification)

        ## --- loss weights ---
        # box=5,  # (default 7.5) box loss weight
        # cls=5,  # (default 0.5) classification loss weight
        # dfl=10,  # (default 1.5) distribution focal loss weight
        # pose=20,  # (default 12) pose loss weight
        # kobj=10,  # (default 2) keypoint objectness loss weight
        # nbs=64,  # nominal batch size for loss normalization

        # label_smoothing=0,  # soften hard labels
        overlap_mask=False,  # (default True) whether segmentation masks may overlap during training

        ## --- validation, plots and checkpoints ---
        val=True,  # validate during training
        plots=True,  # save metric plots and prediction examples
        save=True,  # save checkpoints and final weights
        save_period=-1,  # checkpoint every N epochs; -1 disables periodic saves

        # profile=False,  # profile ONNX / TensorRT speeds during training
        # amp=True,  # automatic mixed precision
    )

    print(f'{run_label} {task} training starts: {time_now()} ')

    if task == 'multitask':
        ## Start training multitask model
        trainer = MultiTaskTrainer(overrides=args)
        trainer.train()
    else:
        model = YOLO(dir_model)
        if pretrained != '':
            ## The released segmentation weights use the short name 'seg'
            weights_task = 'seg' if task == 'segment' else task
            model = model.load(f'yolov8n-{weights_task}.pt')
        model.train(**args)

    print(f'{run_label} {task} training ends: {time_now()} \n')



In [3]:
## Report how many CUDA devices PyTorch can see, then name each one
gpu_count = torch.cuda.device_count()
print(f"CUDA Visible Devices: {gpu_count}")
for gpu_index in range(gpu_count):
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print(f"Device {gpu_index}: {gpu_name}")

CUDA Visible Devices: 1
Device 0: NVIDIA GeForce RTX 4050 Laptop GPU


In [None]:
## Train every dataset / task / model-variant / pretrained combination
for dataset in list_dataset:
    for task in list_task:
        for model_type in list_model_type:
            for pretrained in list_pretrained:
                ## yolo_train only loads pretrained weights for single-task models,
                ## so a multitask run with a pretrained tag is skipped.
                if task == 'multitask' and pretrained != '':
                    continue

                print(f'Dataset: {dataset}, Task: {task}, Model: {model_type}{pretrained}')

                ## Start training
                yolo_train(task=task,
                           model_type=model_type,
                           dir_mtYOLO_root=dir_mtYOLO_root,
                           dataset=dataset,
                           pretrained=pretrained,
                           device=device,
                           epochs=epochs,
                           patience=patience,
                           image_size=image_size,
                           batch_size=batch_size,
                           model_name=model_name
                           )

### Experimentation with Conv Blocks
**Experiment Setup** : 2 epochs, 5 train images, 5 val images, ECA model, multitask (det, pose, seg), RTX 4050 6GB GPU

#### Base Conv:
- Params : 5051264
- GFLOPs : 13.2
- Test mAP50 = {
    box : 0.0237,
    pose : 0,
    mask: 0}
- Speed: 1.4ms preprocess, 20.0ms inference, 0.0ms loss, 3.2ms postprocess per image

#### CBAM Conv:
- Params : 5354045
- GFLOPs : 25.8
- Test mAP50 = {
    box : 0.0315,
    pose : 0.000372,
    mask: 0.00145}
- Time to Complete : 0.005 hrs
- Speed: 0.4ms preprocess, 73.1ms inference, 0.0ms loss, 2.3ms postprocess per image


#### ConvNeXt Conv
- Params : 11963050
- GFLOPs : 30.4
- Test mAP50 = {
    box : 0.0233,
    pose : 0,
    mask: 0.00181}
- Time to complete : 0.005 hrs
- Speed: 0.8ms preprocess, 56.9ms inference, 0.0ms loss, 2.4ms postprocess per image


#### InceptionNeXt Conv
- Params : 11942464
- GFLOPs : 31.5
- Test mAP50 = {
    box : 0.0337,
    pose : 0.00159,
    mask: 0.00181}
- Time to complete : 0.003 hrs
- Speed: 1.8ms preprocess, 76.9ms inference, 0.0ms loss, 3.1ms postprocess per image


# Validation / Prediction

In [None]:
## Validation and Prediction
import torch
from ultralytics import YOLO
from ultralytics.models.yolo.multi import MultiTaskPredictor, MultiTaskValidator
import datetime, os, glob

## ---- Run mode --------------------------------------------------------------
model_mode = "predict"  # one of: 'val', 'predict'

## ---- Sweep lists (the loop below only uses the first entry of each) ---------
list_task = ["multitask"]
list_model_type = ["_ECA"]
# Load existing pre-trained YOLO models (only for pose and segmentation)
list_pretrained = [""]
list_dataset = ["coco"]

## ---- Paths ------------------------------------------------------------------
dir_log_root = "C:/Users/nikhi/Desktop/mtYOLO/log_val"  # not referenced elsewhere in this cell
dir_image = "C:/Users/nikhi/Desktop/val"  # input handed to val / predict

## ---- Runtime settings -------------------------------------------------------
device = [0]
image_size = 640
batch_size = 256  # not referenced in this cell (the `batch` option is commented out)


## Load the trained checkpoint and run either validation or prediction.
## Only the first entry of each sweep list is used.
for dataset in list_dataset[:1]:
    for task in list_task[:1]:
        for model_type in list_model_type[:1]:
            for pretrained in list_pretrained[:1]:
                ## Locate the checkpoint; fail with a readable message when it is missing
                checkpoint_pattern = 'C:/Users/nikhi/Desktop/mtYOLO/model_checkpoint/mtyolov8_coco_multitask_ECA.pt'
                checkpoint_candidates = sorted(glob.glob(checkpoint_pattern, recursive=True))
                if not checkpoint_candidates:
                    raise FileNotFoundError(f'No model checkpoint matches: {checkpoint_pattern}')
                ## Last entry of the sorted matches == the lexicographically greatest path
                dir_model_checkpoint = checkpoint_candidates[-1]
                print(f'Task: {task}, Model: YOLOv8{model_type}{pretrained}, Directory of latest checkpoint: {dir_model_checkpoint}')
                print(f"{dir_image} exists: {os.path.exists(dir_image)}")

                ## Load model (a .pt checkpoint or a .yaml definition is accepted)
                model = YOLO(model=dir_model_checkpoint)

                ## Print model information
                model.info()

                if model_mode == 'val':
                    ## Validation
                    ## NOTE(review): `data` is described as the path to a dataset configuration
                    ## file (e.g. coco8.yaml), but dir_image is assigned an image directory path
                    ## in this cell - confirm this is the intended input.
                    model.val(
                        data=dir_image,  # dataset configuration file (default None)
                        imgsz=image_size,  # images are resized to this size (default 640)
                        # batch=16,  # images per batch; -1 = AutoBatch
                        # save_json=False,  # also write results to a JSON file
                        # save_hybrid=True,  # (default False) save labels merged with model predictions
                        # conf=0.001,  # minimum detection confidence
                        # iou=0.6,  # IoU threshold for NMS
                        # max_det=300,  # maximum detections per image
                        # half=True,  # FP16 computation
                        device=device,  # device for validation (cpu, cuda:0, ...; default None)
                        # dnn=False,  # use OpenCV DNN for ONNX inference
                        plots=True,  # (default False) save prediction-vs-ground-truth plots
                        # rect=False,  # rectangular inference for batching
                        # split='val',  # dataset split to evaluate (val, test or train)
                    )

                elif model_mode == 'predict':
                    ## Predict
                    model.predict(
                        source=dir_image,  # image path, video file, directory, URL or device ID
                        # conf=0.25,  # minimum detection confidence
                        # iou=0.7,  # IoU threshold for NMS
                        # imgsz=640,  # int for square resize, or (height, width)
                        # half=False,  # FP16 inference
                        # device=None,  # e.g. cpu, cuda:0 or 0
                        # max_det=300,  # maximum detections per image
                        # vid_stride=1,  # frame stride for video inputs
                        # stream_buffer=False,  # buffer all frames of a video stream
                        # visualize=True,  # (default False) visualize model features
                        # augment=False,  # test-time augmentation
                        # agnostic_nms=False,  # class-agnostic NMS
                        # classes=None,  # keep only these class IDs
                        # retina_masks=False,  # high-resolution segmentation masks
                        # embed=None,  # layers to extract feature vectors from

                        show=True,  # display the annotated images or videos in a window
                        # save=False,  # save annotated images or videos
                        # save_frames=False,  # save individual video frames
                        # save_txt=False,  # save results as [class] [x_center] [y_center] [width] [height] [confidence]
                        # save_conf=False,  # include confidence scores in the text files
                        # save_crop=False,  # save cropped detections
                        # show_labels=True,  # draw a label for each detection
                        # show_conf=True,  # draw the confidence next to the label
                        # show_boxes=True,  # draw bounding boxes
                        # line_width=None,  # box line width; None = scale with image size
                    )