diff --git a/CHANGELOG.md b/CHANGELOG.md index 80a1bada..ab49db99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,19 +2,48 @@ ## New Features: -No changes to highlight. +- ## Bug Fixes: -No changes to highlight. +- ## Breaking Changes: -No changes to highlight. +- ## Other Changes: -No changes to highlight. +- + +# v0.0.10 + +## New Features: + +- Add a gpu option in `train_with_config` (only single-GPU supported) by `@deepkyu` in [PR 219](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/219) +- Support augmentation for the classification task: cutmix, mixup by `@illian01` in [PR 221](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/221) +- Add model: MixNet by `@illian01` in [PR 229](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/229) +- Add `model.name` to get the exact nickname of the model by `@deepkyu` in [PR 243](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/243/) +- Add transforms: RandomErasing and TrivialAugmentationWide by `@illian01` in [PR 246](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/246) + +## Bug Fixes: + +- Fix PIDNet model dataclass task field by `@illian01` in [PR 220](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/220) +- Fix default criterion value of classification by `@illian01` in [PR 238](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/238) +- Fix model access of 2-stage detection pipeline to be compatible with distributed environments by `@illian` in [PR 239](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/239) + +## Breaking Changes: + +- Enable dataset augmentation customization by `@illian01` in [PR 201](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/201) +- Add postprocessor module by `@illian01` in [PR 223](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/223) +- Equalize the model backbone configuration format by `@illian01` in [PR 228](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/228) +- Separate FPN and PAFPN as neck modules by `@illian01` in [PR 234](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/234) +- Auto-download pretrained checkpoint from AWS S3 by `@deepkyu` in [PR 244](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/244) + +## Other Changes: + +- Update ruff rule (`W`) by `@deepkyu` in [PR 218](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/218) +- Integrate classification loss modules by `@illian01` in [PR 226](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/226) # v0.0.9 @@ -121,6 +150,7 @@ This change is applied at [PR 151](https://github.com/Nota-NetsPresso/netspresso - Initialize loss and metric at same time with optimizer and lr schedulers by `@deepkyu` in [PR 138](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/138) - Hotfix the error which shows 0 for validation loss and metrics by fixing the variable name by `@deepkyu` in [PR 140](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/140) - Add missing field, `save_optimizer_state`, in `logging.yaml` by `@illian01` in [PR 149](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/149) +- Hotfix for pythonic config name (classification loss) by `@deepkyu` in [PR 242](https://github.com/Nota-NetsPresso/netspresso-trainer/pull/242) ## Breaking Changes: diff --git a/config/augmentation/classification.yaml b/config/augmentation/classification.yaml index d8ac19f9..0648b74b 100644 --- a/config/augmentation/classification.yaml +++ b/config/augmentation/classification.yaml @@ -1,31 +1,14 @@ augmentation: img_size: 
&img_size 256 - hsv_h: ~ - hsv_s: ~ - hsv_v: ~ - degrees: ~ - translate: ~ - scale: ~ - max_scale: ~ - min_scale: ~ - crop_size_h: ~ - crop_size_w: ~ - resize_ratio0: ~ - resize_ratiof: ~ - resize_add: ~ - shear: ~ - perspective: ~ - flipud: ~ - fliplr: 0.5 - mosaic: ~ - mixup: 1.0 - copy_paste: ~ - mixup_alpha: 0.0 - cutmix_alpha: 0.0 - mixup_switch_prob: 0.5 - color_jitter: - brightness: ~ - contrast: ~ - saturation: ~ - hue: ~ - colorjitter_p: ~ \ No newline at end of file + transforms: + - + name: randomresizedcrop + size: *img_size + interpolation: bilinear + - + name: randomhorizontalflip + p: 0.5 + mix_transforms: + - + name: cutmix + alpha: 1.0 diff --git a/config/augmentation/detection.yaml b/config/augmentation/detection.yaml index 4dafab46..11022618 100644 --- a/config/augmentation/detection.yaml +++ b/config/augmentation/detection.yaml @@ -1,31 +1,7 @@ augmentation: img_size: &img_size 512 - hsv_h: ~ - hsv_s: ~ - hsv_v: ~ - degrees: ~ - translate: ~ - scale: ~ - max_scale: 2048 - min_scale: 768 - crop_size_h: 512 - crop_size_w: 512 - resize_ratio0: 0.5 - resize_ratiof: 2.0 - resize_add: 1 - shear: ~ - perspective: ~ - flipud: ~ - fliplr: 0.5 - mosaic: ~ - mixup: ~ - copy_paste: ~ - mixup_alpha: ~ - cutmix_alpha: ~ - mixup_switch_prob: ~ - color_jitter: - brightness: 0.25 - contrast: 0.25 - saturation: 0.25 - hue: 0.1 - colorjitter_p: 0.5 \ No newline at end of file + transforms: + - + name: resize + size: *img_size + interpolation: bilinear diff --git a/config/augmentation/segmentation.yaml b/config/augmentation/segmentation.yaml index 48dae02f..d878f5af 100644 --- a/config/augmentation/segmentation.yaml +++ b/config/augmentation/segmentation.yaml @@ -1,31 +1,17 @@ augmentation: img_size: &img_size 512 - hsv_h: ~ - hsv_s: ~ - hsv_v: ~ - degrees: ~ - translate: ~ - scale: ~ - max_scale: 1024 - min_scale: *img_size - crop_size_h: *img_size - crop_size_w: *img_size - resize_ratio0: 1.0 - resize_ratiof: 1.5 - resize_add: 1 - shear: ~ - perspective: ~ - flipud: ~ - fliplr: 0.5 - mosaic: ~ - mixup: ~ - copy_paste: ~ - mixup_alpha: ~ - cutmix_alpha: ~ - mixup_switch_prob: ~ - color_jitter: - brightness: 0.25 - contrast: 0.25 - saturation: 0.25 - hue: 0.1 - colorjitter_p: 0.5 \ No newline at end of file + transforms: + - + name: randomresizedcrop + size: *img_size + interpolation: bilinear + - + name: randomhorizontalflip + p: 0.5 + - + name: colorjitter + brightness: 0.25 + contrast: 0.25 + saturation: 0.25 + hue: 0.1 + p: 0.5 diff --git a/config/augmentation/template/common.yaml b/config/augmentation/template/common.yaml index 48dae02f..881cb816 100644 --- a/config/augmentation/template/common.yaml +++ b/config/augmentation/template/common.yaml @@ -1,31 +1,27 @@ augmentation: - img_size: &img_size 512 - hsv_h: ~ - hsv_s: ~ - hsv_v: ~ - degrees: ~ - translate: ~ - scale: ~ - max_scale: 1024 - min_scale: *img_size - crop_size_h: *img_size - crop_size_w: *img_size - resize_ratio0: 1.0 - resize_ratiof: 1.5 - resize_add: 1 - shear: ~ - perspective: ~ - flipud: ~ - fliplr: 0.5 - mosaic: ~ - mixup: ~ - copy_paste: ~ - mixup_alpha: ~ - cutmix_alpha: ~ - mixup_switch_prob: ~ - color_jitter: - brightness: 0.25 - contrast: 0.25 - saturation: 0.25 - hue: 0.1 - colorjitter_p: 0.5 \ No newline at end of file + img_size: &img_size ~ + transforms: + - + name: randomresizedcrop + size: ~ + interpolation: bilinear + - + name: randomhorizontalflip + p: ~ + - + name: randomverticalflip + p: ~ + - + name: colorjitter + brightness: ~ + contrast: ~ + saturation: ~ + hue: ~ + p: ~ + - + name: resize + size: 
~ + - + name: pad + padding: ~ + mix_transforms: ~ \ No newline at end of file diff --git a/config/model/efficientformer/efficientformer-l1-classification.yaml b/config/model/efficientformer/efficientformer-l1-classification.yaml index 681794cc..b7f51e53 100644 --- a/config/model/efficientformer/efficientformer-l1-classification.yaml +++ b/config/model/efficientformer/efficientformer-l1-classification.yaml @@ -1,5 +1,6 @@ model: task: classification + name: efficientformer_l1 checkpoint: ./weights/efficientformer/efficientformer_l1_1000d.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -8,29 +9,44 @@ model: full: ~ # auto backbone: name: efficientformer - num_blocks: [3, 2, 6, 4] - hidden_sizes: [48, 96, 224, 448] - num_attention_heads: 8 - attention_hidden_size: 256 # attention_hidden_size_splitted * num_attention_heads - attention_dropout_prob: 0. - attention_ratio: 4 - attention_bias_resolution: 16 - pool_size: 3 - intermediate_ratio: 4 - hidden_dropout_prob: 0. - hidden_activation_type: 'gelu' - layer_norm_eps: 1e-5 - drop_path_rate: 0. - use_layer_scale: True - layer_scale_init_value: 1e-5 - downsamples: [True, True, True, True] - down_patch_size: 3 - down_stride: 2 - down_pad: 1 - vit_num: 1 + params: + num_attention_heads: 8 + attention_hidden_size: 256 # attention_hidden_size_splitted * num_attention_heads + attention_dropout_prob: 0. + attention_ratio: 4 + attention_bias_resolution: 16 + pool_size: 3 + intermediate_ratio: 4 + hidden_dropout_prob: 0. + hidden_activation_type: 'gelu' + layer_norm_eps: 1e-5 + drop_path_rate: 0. + use_layer_scale: True + layer_scale_init_value: 1e-5 + down_patch_size: 3 + down_stride: 2 + down_pad: 1 + vit_num: 1 + stage_params: + - + num_blocks: 3 + hidden_sizes: 48 + downsamples: True + - + num_blocks: 2 + hidden_sizes: 96 + downsamples: True + - + num_blocks: 6 + hidden_sizes: 224 + downsamples: True + - + num_blocks: 4 + hidden_sizes: 448 + downsamples: True head: name: fc losses: - - criterion: label_smoothing_cross_entropy - smoothing: 0.1 + - criterion: cross_entropy + label_smoothing: 0.1 weight: ~ \ No newline at end of file diff --git a/config/model/efficientformer/efficientformer-l1-detection.yaml b/config/model/efficientformer/efficientformer-l1-detection.yaml index e81da42f..9a3b339f 100644 --- a/config/model/efficientformer/efficientformer-l1-detection.yaml +++ b/config/model/efficientformer/efficientformer-l1-detection.yaml @@ -1,5 +1,6 @@ model: task: detection + name: efficientformer_l1 checkpoint: ./weights/efficientformer/efficientformer_l1_1000d.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -8,26 +9,43 @@ model: full: ~ # auto backbone: name: efficientformer - num_blocks: [3, 2, 6, 4] - hidden_sizes: [48, 96, 224, 448] - num_attention_heads: 8 - attention_hidden_size: 256 # attention_hidden_size_splitted * num_attention_heads - attention_dropout_prob: 0. - attention_ratio: 4 - attention_bias_resolution: 16 - pool_size: 3 - intermediate_ratio: 4 - hidden_dropout_prob: 0. - hidden_activation_type: 'gelu' - layer_norm_eps: 1e-5 - drop_path_rate: 0. - use_layer_scale: True - layer_scale_init_value: 1e-5 - downsamples: [True, True, True, True] - down_patch_size: 3 - down_stride: 2 - down_pad: 1 - vit_num: 1 + params: + num_attention_heads: 8 + attention_hidden_size: 256 # attention_hidden_size_splitted * num_attention_heads + attention_dropout_prob: 0. + attention_ratio: 4 + attention_bias_resolution: 16 + pool_size: 3 + intermediate_ratio: 4 + hidden_dropout_prob: 0. 
+ hidden_activation_type: 'gelu' + layer_norm_eps: 1e-5 + drop_path_rate: 0. + use_layer_scale: True + layer_scale_init_value: 1e-5 + down_patch_size: 3 + down_stride: 2 + down_pad: 1 + vit_num: 1 + stage_params: + - + num_blocks: 3 + hidden_sizes: 48 + downsamples: True + - + num_blocks: 2 + hidden_sizes: 96 + downsamples: True + - + num_blocks: 6 + hidden_sizes: 224 + downsamples: True + - + num_blocks: 4 + hidden_sizes: 448 + downsamples: True + neck: + name: fpn head: name: faster_rcnn losses: diff --git a/config/model/efficientformer/efficientformer-l1-segmentation.yaml b/config/model/efficientformer/efficientformer-l1-segmentation.yaml index 029abf54..b28718f9 100644 --- a/config/model/efficientformer/efficientformer-l1-segmentation.yaml +++ b/config/model/efficientformer/efficientformer-l1-segmentation.yaml @@ -1,5 +1,6 @@ model: task: segmentation + name: efficientformer_l1 checkpoint: ./weights/efficientformer/efficientformer_l1_1000d.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -8,26 +9,41 @@ model: full: ~ # auto backbone: name: efficientformer - num_blocks: [3, 2, 6, 4] - hidden_sizes: [48, 96, 224, 448] - num_attention_heads: 8 - attention_hidden_size: 256 # attention_hidden_size_splitted * num_attention_heads - attention_dropout_prob: 0. - attention_ratio: 4 - attention_bias_resolution: 16 - pool_size: 3 - intermediate_ratio: 4 - hidden_dropout_prob: 0. - hidden_activation_type: 'gelu' - layer_norm_eps: 1e-5 - drop_path_rate: 0. - use_layer_scale: True - layer_scale_init_value: 1e-5 - downsamples: [True, True, True, True] - down_patch_size: 3 - down_stride: 2 - down_pad: 1 - vit_num: 1 + params: + num_attention_heads: 8 + attention_hidden_size: 256 # attention_hidden_size_splitted * num_attention_heads + attention_dropout_prob: 0. + attention_ratio: 4 + attention_bias_resolution: 16 + pool_size: 3 + intermediate_ratio: 4 + hidden_dropout_prob: 0. + hidden_activation_type: 'gelu' + layer_norm_eps: 1e-5 + drop_path_rate: 0. + use_layer_scale: True + layer_scale_init_value: 1e-5 + down_patch_size: 3 + down_stride: 2 + down_pad: 1 + vit_num: 1 + stage_params: + - + num_blocks: 3 + hidden_sizes: 48 + downsamples: True + - + num_blocks: 2 + hidden_sizes: 96 + downsamples: True + - + num_blocks: 6 + hidden_sizes: 224 + downsamples: True + - + num_blocks: 4 + hidden_sizes: 448 + downsamples: True head: name: all_mlp_decoder losses: diff --git a/config/model/mixnet/mixnet-l-classification.yaml b/config/model/mixnet/mixnet-l-classification.yaml new file mode 100644 index 00000000..80aed0f4 --- /dev/null +++ b/config/model/mixnet/mixnet-l-classification.yaml @@ -0,0 +1,67 @@ +model: + task: classification + name: mixnet_l + checkpoint: ./weights/mixnet/mixnet_l.pth + fx_model_checkpoint: ~ + resume_optimizer_checkpoint: ~ + freeze_backbone: False + architecture: + full: ~ # auto + backbone: + name: mixnet + params: + stem_planes: 24 + width_multi: 1.3 + depth_multi: 1.0 + dropout_rate: 0. 
+ stage_params: + - + expand_ratio: [1, 6, 3] + out_channels: [24, 32, 32] + num_blocks: [1, 1, 1] + kernel_sizes: [[3], [3, 5, 7], [3]] + exp_kernel_sizes: [[1], [1, 1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1, 1]] + stride: [1, 2, 1] + dilation: [1, 1, 1] + act_type: ["relu", "relu", "relu"] + se_reduction_ratio: [~, ~, ~] + - + expand_ratio: [6, 6] + out_channels: [40, 40] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7, 9], [3, 5]] + exp_kernel_sizes: [[1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + - + expand_ratio: [6, 6, 6, 3] + out_channels: [80, 80, 120, 120] + num_blocks: [1, 3, 1, 3] + kernel_sizes: [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1, 1], [1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1], [1, 1]] + stride: [2, 1, 1, 1] + dilation: [1, 1, 1, 1] + act_type: ["swish", "swish", "swish", "swish"] + se_reduction_ratio: [4, 4, 2, 2] + - + expand_ratio: [6, 6] + out_channels: [200, 200] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7, 9], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + head: + name: fc + losses: + - criterion: cross_entropy + label_smoothing: 0.1 + weight: ~ \ No newline at end of file diff --git a/config/model/mixnet/mixnet-l-segmentation.yaml b/config/model/mixnet/mixnet-l-segmentation.yaml new file mode 100644 index 00000000..623e4675 --- /dev/null +++ b/config/model/mixnet/mixnet-l-segmentation.yaml @@ -0,0 +1,67 @@ +model: + task: segmentation + name: mixnet_l + checkpoint: ./weights/mixnet/mixnet_l.pth + fx_model_checkpoint: ~ + resume_optimizer_checkpoint: ~ + freeze_backbone: False + architecture: + full: ~ # auto + backbone: + name: mixnet + params: + stem_planes: 24 + width_multi: 1.3 + depth_multi: 1.0 + dropout_rate: 0. 
+ stage_params: + - + expand_ratio: [1, 6, 3] + out_channels: [24, 32, 32] + num_blocks: [1, 1, 1] + kernel_sizes: [[3], [3, 5, 7], [3]] + exp_kernel_sizes: [[1], [1, 1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1, 1]] + stride: [1, 2, 1] + dilation: [1, 1, 1] + act_type: ["relu", "relu", "relu"] + se_reduction_ratio: [~, ~, ~] + - + expand_ratio: [6, 6] + out_channels: [40, 40] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7, 9], [3, 5]] + exp_kernel_sizes: [[1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + - + expand_ratio: [6, 6, 6, 3] + out_channels: [80, 80, 120, 120] + num_blocks: [1, 3, 1, 3] + kernel_sizes: [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1, 1], [1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1], [1, 1]] + stride: [2, 1, 1, 1] + dilation: [1, 1, 1, 1] + act_type: ["swish", "swish", "swish", "swish"] + se_reduction_ratio: [4, 4, 2, 2] + - + expand_ratio: [6, 6] + out_channels: [200, 200] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7, 9], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + head: + name: all_mlp_decoder + losses: + - criterion: cross_entropy + weight: ~ + ignore_index: 255 \ No newline at end of file diff --git a/config/model/mixnet/mixnet-m-classification.yaml b/config/model/mixnet/mixnet-m-classification.yaml new file mode 100644 index 00000000..c41088ef --- /dev/null +++ b/config/model/mixnet/mixnet-m-classification.yaml @@ -0,0 +1,67 @@ +model: + task: classification + name: mixnet_m + checkpoint: ./weights/mixnet/mixnet_m.pth + fx_model_checkpoint: ~ + resume_optimizer_checkpoint: ~ + freeze_backbone: False + architecture: + full: ~ # auto + backbone: + name: mixnet + params: + stem_planes: 24 + width_multi: 1.0 + depth_multi: 1.0 + dropout_rate: 0. 
+ stage_params: + - + expand_ratio: [1, 6, 3] + out_channels: [24, 32, 32] + num_blocks: [1, 1, 1] + kernel_sizes: [[3], [3, 5, 7], [3]] + exp_kernel_sizes: [[1], [1, 1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1, 1]] + stride: [1, 2, 1] + dilation: [1, 1, 1] + act_type: ["relu", "relu", "relu"] + se_reduction_ratio: [~, ~, ~] + - + expand_ratio: [6, 6] + out_channels: [40, 40] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7, 9], [3, 5]] + exp_kernel_sizes: [[1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + - + expand_ratio: [6, 6, 6, 3] + out_channels: [80, 80, 120, 120] + num_blocks: [1, 3, 1, 3] + kernel_sizes: [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1, 1], [1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1], [1, 1]] + stride: [2, 1, 1, 1] + dilation: [1, 1, 1, 1] + act_type: ["swish", "swish", "swish", "swish"] + se_reduction_ratio: [4, 4, 2, 2] + - + expand_ratio: [6, 6] + out_channels: [200, 200] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7, 9], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + head: + name: fc + losses: + - criterion: cross_entropy + label_smoothing: 0.1 + weight: ~ \ No newline at end of file diff --git a/config/model/mixnet/mixnet-m-segmentation.yaml b/config/model/mixnet/mixnet-m-segmentation.yaml new file mode 100644 index 00000000..affd2b9a --- /dev/null +++ b/config/model/mixnet/mixnet-m-segmentation.yaml @@ -0,0 +1,67 @@ +model: + task: segmentation + name: mixnet_m + checkpoint: ./weights/mixnet/mixnet_m.pth + fx_model_checkpoint: ~ + resume_optimizer_checkpoint: ~ + freeze_backbone: False + architecture: + full: ~ # auto + backbone: + name: mixnet + params: + stem_planes: 24 + width_multi: 1.0 + depth_multi: 1.0 + dropout_rate: 0. 
+ stage_params: + - + expand_ratio: [1, 6, 3] + out_channels: [24, 32, 32] + num_blocks: [1, 1, 1] + kernel_sizes: [[3], [3, 5, 7], [3]] + exp_kernel_sizes: [[1], [1, 1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1, 1]] + stride: [1, 2, 1] + dilation: [1, 1, 1] + act_type: ["relu", "relu", "relu"] + se_reduction_ratio: [~, ~, ~] + - + expand_ratio: [6, 6] + out_channels: [40, 40] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7, 9], [3, 5]] + exp_kernel_sizes: [[1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + - + expand_ratio: [6, 6, 6, 3] + out_channels: [80, 80, 120, 120] + num_blocks: [1, 3, 1, 3] + kernel_sizes: [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1, 1], [1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1], [1, 1]] + stride: [2, 1, 1, 1] + dilation: [1, 1, 1, 1] + act_type: ["swish", "swish", "swish", "swish"] + se_reduction_ratio: [4, 4, 2, 2] + - + expand_ratio: [6, 6] + out_channels: [200, 200] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7, 9], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + head: + name: all_mlp_decoder + losses: + - criterion: cross_entropy + weight: ~ + ignore_index: 255 \ No newline at end of file diff --git a/config/model/mixnet/mixnet-s-classification.yaml b/config/model/mixnet/mixnet-s-classification.yaml new file mode 100644 index 00000000..e4c54cc0 --- /dev/null +++ b/config/model/mixnet/mixnet-s-classification.yaml @@ -0,0 +1,67 @@ +model: + task: classification + name: mixnet_s + checkpoint: ./weights/mixnet/mixnet_s.pth + fx_model_checkpoint: ~ + resume_optimizer_checkpoint: ~ + freeze_backbone: False + architecture: + full: ~ # auto + backbone: + name: mixnet + params: + stem_planes: 16 + width_multi: 1.0 + depth_multi: 1.0 + dropout_rate: 0. 
+ stage_params: + - + expand_ratio: [1, 6, 3] + out_channels: [16, 24, 24] + num_blocks: [1, 1, 1] + kernel_sizes: [[3], [3], [3]] + exp_kernel_sizes: [[1], [1, 1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1, 1]] + stride: [1, 2, 1] + dilation: [1, 1, 1] + act_type: ["relu", "relu", "relu"] + se_reduction_ratio: [~, ~, ~] + - + expand_ratio: [6, 6] + out_channels: [40, 40] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7], [3, 5]] + exp_kernel_sizes: [[1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + - + expand_ratio: [6, 6, 6, 3] + out_channels: [80, 80, 120, 120] + num_blocks: [1, 2, 1, 2] + kernel_sizes: [[3, 5, 7], [3, 5], [3, 5, 7], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1], [1, 1], [1, 1]] + poi_kernel_sizes: [[1, 1], [1, 1], [1, 1], [1, 1]] + stride: [2, 1, 1, 1] + dilation: [1, 1, 1, 1] + act_type: ["swish", "swish", "swish", "swish"] + se_reduction_ratio: [4, 4, 2, 2] + - + expand_ratio: [6, 6] + out_channels: [200, 200] + num_blocks: [1, 2] + kernel_sizes: [[3, 5, 7, 9, 11], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + head: + name: fc + losses: + - criterion: cross_entropy + label_smoothing: 0.1 + weight: ~ \ No newline at end of file diff --git a/config/model/mixnet/mixnet-s-segmentation.yaml b/config/model/mixnet/mixnet-s-segmentation.yaml new file mode 100644 index 00000000..dd8cdeb9 --- /dev/null +++ b/config/model/mixnet/mixnet-s-segmentation.yaml @@ -0,0 +1,67 @@ +model: + task: segmentation + name: mixnet_s + checkpoint: ./weights/mixnet/mixnet_s.pth + fx_model_checkpoint: ~ + resume_optimizer_checkpoint: ~ + freeze_backbone: False + architecture: + full: ~ # auto + backbone: + name: mixnet + params: + stem_planes: 16 + width_multi: 1.0 + depth_multi: 1.0 + dropout_rate: 0. 
+ stage_params: + - + expand_ratio: [1, 6, 3] + out_channels: [16, 24, 24] + num_blocks: [1, 1, 1] + kernel_sizes: [[3], [3], [3]] + exp_kernel_sizes: [[1], [1, 1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1], [1, 1]] + stride: [1, 2, 1] + dilation: [1, 1, 1] + act_type: ["relu", "relu", "relu"] + se_reduction_ratio: [~, ~, ~] + - + expand_ratio: [6, 6] + out_channels: [40, 40] + num_blocks: [1, 3] + kernel_sizes: [[3, 5, 7], [3, 5]] + exp_kernel_sizes: [[1], [1, 1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + - + expand_ratio: [6, 6, 6, 3] + out_channels: [80, 80, 120, 120] + num_blocks: [1, 2, 1, 2] + kernel_sizes: [[3, 5, 7], [3, 5], [3, 5, 7], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1], [1, 1], [1, 1]] + poi_kernel_sizes: [[1, 1], [1, 1], [1, 1], [1, 1]] + stride: [2, 1, 1, 1] + dilation: [1, 1, 1, 1] + act_type: ["swish", "swish", "swish", "swish"] + se_reduction_ratio: [4, 4, 2, 2] + - + expand_ratio: [6, 6] + out_channels: [200, 200] + num_blocks: [1, 2] + kernel_sizes: [[3, 5, 7, 9, 11], [3, 5, 7, 9]] + exp_kernel_sizes: [[1], [1]] + poi_kernel_sizes: [[1], [1, 1]] + stride: [2, 1] + dilation: [1, 1] + act_type: ["swish", "swish"] + se_reduction_ratio: [2, 2] + head: + name: all_mlp_decoder + losses: + - criterion: cross_entropy + weight: ~ + ignore_index: 255 \ No newline at end of file diff --git a/config/model/mobilenetv3/mobilenetv3-small-classification.yaml b/config/model/mobilenetv3/mobilenetv3-small-classification.yaml index 27398828..6ddd1ff2 100644 --- a/config/model/mobilenetv3/mobilenetv3-small-classification.yaml +++ b/config/model/mobilenetv3/mobilenetv3-small-classification.yaml @@ -1,5 +1,6 @@ model: task: classification + name: mobilenet_v3_small checkpoint: ./weights/mobilenetv3/mobilenet_v3_small.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -7,26 +8,48 @@ model: architecture: full: ~ # auto backbone: - name: mobilenetv3_small - block_info: # [in_channels, kernel, expended_channels, out_channels, use_se, activation, stride, dilation] + name: mobilenetv3 + params: ~ + stage_params: - - - [16, 3, 16, 16, True, "relu", 2, 1] + in_channels: [16] + kernel: [3] + expanded_channels: [16] + out_channels: [16] + use_se: [True] + activation: ["relu"] + stride: [2] + dilation: [1] - - - [16, 3, 72, 24, False, "relu", 2, 1] - - [24, 3, 88, 24, False, "relu", 1, 1] + in_channels: [16, 24] + kernel: [3, 3] + expanded_channels: [72, 88] + out_channels: [24, 24] + use_se: [False, False] + activation: ["relu", "relu"] + stride: [2, 1] + dilation: [1, 1] - - - [24, 5, 96, 40, True, "hard_swish", 2, 1] - - [40, 5, 240, 40, True, "hard_swish", 1, 1] - - [40, 5, 240, 40, True, "hard_swish", 1, 1] - - [40, 5, 120, 48, True, "hard_swish", 1, 1] - - [48, 5, 144, 48, True, "hard_swish", 1, 1] + in_channels: [24, 40, 40, 40, 48] + kernel: [5, 5, 5, 5, 5] + expanded_channels: [96, 240, 240, 120, 144] + out_channels: [40, 40, 40, 48, 48] + use_se: [True, True, True, True, True] + activation: ["hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish"] + stride: [2, 1, 1, 1, 1] + dilation: [1, 1, 1, 1, 1] - - - [48, 5, 288, 96, True, "hard_swish", 2, 1] - - [96, 5, 576, 96, True, "hard_swish", 1, 1] - - [96, 5, 576, 96, True, "hard_swish", 1, 1] + in_channels: [48, 96, 96] + kernel: [5, 5, 5] + expanded_channels: [288, 576, 576] + out_channels: [96, 96, 96] + use_se: [True, True, True] + activation: ["hard_swish", "hard_swish", "hard_swish"] + stride: [2, 1, 1] + dilation: [1, 1, 1] 
head: name: fc losses: - - criterion: label_smoothing_cross_entropy - smoothing: 0.1 + - criterion: cross_entropy + label_smoothing: 0.1 weight: ~ \ No newline at end of file diff --git a/config/model/mobilenetv3/mobilenetv3-small-segmentation.yaml b/config/model/mobilenetv3/mobilenetv3-small-segmentation.yaml index 58aceec8..6c8438fc 100644 --- a/config/model/mobilenetv3/mobilenetv3-small-segmentation.yaml +++ b/config/model/mobilenetv3/mobilenetv3-small-segmentation.yaml @@ -1,5 +1,6 @@ model: task: segmentation + name: mobilenet_v3_small checkpoint: ./weights/mobilenetv3/mobilenet_v3_small.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -7,23 +8,45 @@ model: architecture: full: ~ # auto backbone: - name: mobilenetv3_small - block_info: # [in_channels, kernel, expended_channels, out_channels, use_se, activation, stride, dilation] + name: mobilenetv3 + params: ~ + stage_params: - - - [16, 3, 16, 16, True, "relu", 2, 1] + in_channels: [16] + kernel: [3] + expanded_channels: [16] + out_channels: [16] + use_se: [True] + activation: ["relu"] + stride: [2] + dilation: [1] - - - [16, 3, 72, 24, False, "relu", 2, 1] - - [24, 3, 88, 24, False, "relu", 1, 1] + in_channels: [16, 24] + kernel: [3, 3] + expanded_channels: [72, 88] + out_channels: [24, 24] + use_se: [False, False] + activation: ["relu", "relu"] + stride: [2, 1] + dilation: [1, 1] - - - [24, 5, 96, 40, True, "hard_swish", 2, 1] - - [40, 5, 240, 40, True, "hard_swish", 1, 1] - - [40, 5, 240, 40, True, "hard_swish", 1, 1] - - [40, 5, 120, 48, True, "hard_swish", 1, 1] - - [48, 5, 144, 48, True, "hard_swish", 1, 1] + in_channels: [24, 40, 40, 40, 48] + kernel: [5, 5, 5, 5, 5] + expanded_channels: [96, 240, 240, 120, 144] + out_channels: [40, 40, 40, 48, 48] + use_se: [True, True, True, True, True] + activation: ["hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish"] + stride: [2, 1, 1, 1, 1] + dilation: [1, 1, 1, 1, 1] - - - [48, 5, 288, 96, True, "hard_swish", 2, 1] - - [96, 5, 576, 96, True, "hard_swish", 1, 1] - - [96, 5, 576, 96, True, "hard_swish", 1, 1] + in_channels: [48, 96, 96] + kernel: [5, 5, 5] + expanded_channels: [288, 576, 576] + out_channels: [96, 96, 96] + use_se: [True, True, True] + activation: ["hard_swish", "hard_swish", "hard_swish"] + stride: [2, 1, 1] + dilation: [1, 1, 1] head: name: all_mlp_decoder losses: diff --git a/config/model/mobilevit/mobilevit-s-classification.yaml b/config/model/mobilevit/mobilevit-s-classification.yaml index 17bcc1cf..6e21b48c 100644 --- a/config/model/mobilevit/mobilevit-s-classification.yaml +++ b/config/model/mobilevit/mobilevit-s-classification.yaml @@ -1,5 +1,6 @@ model: task: classification + name: mobilevit_s checkpoint: ./weights/mobilevit/mobilevit_s.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -8,27 +9,70 @@ model: full: ~ # auto backbone: name: mobilevit - out_channels: [32, 64, 96, 128, 160] - block_type: ['mv2', 'mv2', 'mobilevit', 'mobilevit', 'mobilevit'] - num_blocks: [1, 3, None, None, None] - stride: [1, 2, 2, 2, 2] - hidden_size: [None, None, 144, 192, 240] - intermediate_size: [None, None, 288, 384, 480] - num_transformer_blocks: [None, None, 2, 4, 3] - dilate: [None, None, False, False, False] - expand_ratio: [4, 4, 4, 4, 4] # [mv2_exp_mult] * 4 - patch_embedding_out_channels: 16 - local_kernel_size: 3 - patch_size: 2 - num_attention_heads: 4 # num_heads - attention_dropout_prob: 0.1 - hidden_dropout_prob: 0.0 - exp_factor: 4 - layer_norm_eps: 1e-5 - use_fusion_layer: True + params: + patch_embedding_out_channels: 16 + 
local_kernel_size: 3 + patch_size: 2 + num_attention_heads: 4 # num_heads + attention_dropout_prob: 0.1 + hidden_dropout_prob: 0.0 + exp_factor: 4 + layer_norm_eps: 1e-5 + use_fusion_layer: True + stage_params: + - + out_channels: 32 + block_type: 'mv2' + num_blocks: 1 + stride: 1 + hidden_size: None + intermediate_size: None + num_transformer_blocks: None + dilate: None + expand_ratio: 4 # [mv2_exp_mult] * 4 + - + out_channels: 64 + block_type: 'mv2' + num_blocks: 3 + stride: 2 + hidden_size: None + intermediate_size: None + num_transformer_blocks: None + dilate: None + expand_ratio: 4 # [mv2_exp_mult] * 4 + - + out_channels: 96 + block_type: 'mobilevit' + num_blocks: None + stride: 2 + hidden_size: 144 + intermediate_size: 288 + num_transformer_blocks: 2 + dilate: False + expand_ratio: 4 # [mv2_exp_mult] * 4 + - + out_channels: 128 + block_type: 'mobilevit' + num_blocks: None + stride: 2 + hidden_size: 192 + intermediate_size: 384 + num_transformer_blocks: 4 + dilate: False + expand_ratio: 4 # [mv2_exp_mult] * 4 + - + out_channels: 160 + block_type: 'mobilevit' + num_blocks: None + stride: 2 + hidden_size: 240 + intermediate_size: 480 + num_transformer_blocks: 3 + dilate: False + expand_ratio: 4 # [mv2_exp_mult] * 4 head: name: fc losses: - - criterion: label_smoothing_cross_entropy - smoothing: 0.1 + - criterion: cross_entropy + label_smoothing: 0.1 weight: ~ \ No newline at end of file diff --git a/config/model/pidnet/pidnet-s-segmentation.yaml b/config/model/pidnet/pidnet-s-segmentation.yaml index 52223fcf..2cbaf179 100644 --- a/config/model/pidnet/pidnet-s-segmentation.yaml +++ b/config/model/pidnet/pidnet-s-segmentation.yaml @@ -1,5 +1,6 @@ model: task: segmentation + name: pidnet_s checkpoint: ./weights/pidnet/pidnet_s.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ diff --git a/config/model/resnet/resnet50-classification.yaml b/config/model/resnet/resnet50-classification.yaml index 10c9bc99..a781931f 100644 --- a/config/model/resnet/resnet50-classification.yaml +++ b/config/model/resnet/resnet50-classification.yaml @@ -1,5 +1,6 @@ model: task: classification + name: resnet50 checkpoint: ./weights/resnet/resnet50.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -7,12 +8,33 @@ model: architecture: full: ~ # auto backbone: - name: resnet50 - block: bottleneck - layers: [3, 4, 6, 3] + name: resnet + params: + block: bottleneck + norm_layer: batch_norm + groups: 1 + width_per_group: 64 + zero_init_residual: False + expansion: ~ + stage_params: + - + plane: 64 + layers: 3 + - + plane: 128 + layers: 4 + replace_stride_with_dilation: False + - + plane: 256 + layers: 6 + replace_stride_with_dilation: False + - + plane: 512 + layers: 3 + replace_stride_with_dilation: False head: name: fc losses: - - criterion: label_smoothing_cross_entropy - smoothing: 0.1 + - criterion: cross_entropy + label_smoothing: 0.1 weight: ~ \ No newline at end of file diff --git a/config/model/resnet/resnet50-segmentation.yaml b/config/model/resnet/resnet50-segmentation.yaml index 3f83f708..00212984 100644 --- a/config/model/resnet/resnet50-segmentation.yaml +++ b/config/model/resnet/resnet50-segmentation.yaml @@ -1,5 +1,6 @@ model: task: segmentation + name: resnet50 checkpoint: ./weights/resnet/resnet50.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -8,9 +9,30 @@ model: full: name: ~ # auto backbone: - name: resnet50 - block: bottleneck - layers: [3, 4, 6, 3] + name: resnet + params: + block: bottleneck + norm_layer: batch_norm + groups: 1 + width_per_group: 64 + 
zero_init_residual: False + expansion: ~ + stage_params: + - + plane: 64 + layers: 3 + - + plane: 128 + layers: 4 + replace_stride_with_dilation: False + - + plane: 256 + layers: 6 + replace_stride_with_dilation: False + - + plane: 512 + layers: 3 + replace_stride_with_dilation: False head: name: all_mlp_decoder losses: diff --git a/config/model/segformer/segformer-classification.yaml b/config/model/segformer/segformer-classification.yaml index e669de24..72eea484 100644 --- a/config/model/segformer/segformer-classification.yaml +++ b/config/model/segformer/segformer-classification.yaml @@ -1,5 +1,6 @@ model: task: classification + name: segformer checkpoint: ./weights/segformer/segformer.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -8,21 +9,44 @@ model: full: ~ # auto backbone: name: segformer - num_modules: 4 # `num_encoder_blocks` in original - num_blocks: [2, 2, 2, 2] # `depth` in original - sr_ratios: [8, 4, 2, 1] - hidden_sizes: [32, 64, 160, 256] - embedding_patch_sizes: [7, 3, 3, 3] - embedding_strides: [4, 2, 2, 2] - num_attention_heads: [1, 2, 5, 8] - intermediate_ratio: 4 - hidden_activation_type: "gelu" - hidden_dropout_prob: 0.0 - attention_dropout_prob: 0.0 - layer_norm_eps: 1e-5 + params: + intermediate_ratio: 4 + hidden_activation_type: "gelu" + hidden_dropout_prob: 0.0 + attention_dropout_prob: 0.0 + layer_norm_eps: 1e-5 + stage_params: + - + num_blocks: 2 + sr_ratios: 8 + hidden_sizes: 32 + embedding_patch_sizes: 7 + embedding_strides: 4 + num_attention_heads: 1 + - + num_blocks: 2 + sr_ratios: 4 + hidden_sizes: 64 + embedding_patch_sizes: 3 + embedding_strides: 2 + num_attention_heads: 2 + - + num_blocks: 2 + sr_ratios: 2 + hidden_sizes: 160 + embedding_patch_sizes: 3 + embedding_strides: 2 + num_attention_heads: 5 + - + num_blocks: 2 + sr_ratios: 1 + hidden_sizes: 256 + embedding_patch_sizes: 3 + embedding_strides: 2 + num_attention_heads: 8 head: name: fc losses: - - criterion: label_smoothing_cross_entropy - smoothing: 0.1 + - criterion: cross_entropy + label_smoothing: 0.1 weight: ~ \ No newline at end of file diff --git a/config/model/segformer/segformer-segmentation.yaml b/config/model/segformer/segformer-segmentation.yaml index bb990dfc..589d31ac 100644 --- a/config/model/segformer/segformer-segmentation.yaml +++ b/config/model/segformer/segformer-segmentation.yaml @@ -1,5 +1,6 @@ model: task: segmentation + name: segformer checkpoint: ./weights/segformer/segformer.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -8,18 +9,41 @@ model: full: ~ # auto backbone: name: segformer - num_modules: 4 # `num_encoder_blocks` in original - num_blocks: [2, 2, 2, 2] # `depth` in original - sr_ratios: [8, 4, 2, 1] - hidden_sizes: [32, 64, 160, 256] - embedding_patch_sizes: [7, 3, 3, 3] - embedding_strides: [4, 2, 2, 2] - num_attention_heads: [1, 2, 5, 8] - intermediate_ratio: 4 - hidden_activation_type: "gelu" - hidden_dropout_prob: 0.0 - attention_dropout_prob: 0.0 - layer_norm_eps: 1e-5 + params: + intermediate_ratio: 4 + hidden_activation_type: "gelu" + hidden_dropout_prob: 0.0 + attention_dropout_prob: 0.0 + layer_norm_eps: 1e-5 + stage_params: + - + num_blocks: 2 + sr_ratios: 8 + hidden_sizes: 32 + embedding_patch_sizes: 7 + embedding_strides: 4 + num_attention_heads: 1 + - + num_blocks: 2 + sr_ratios: 4 + hidden_sizes: 64 + embedding_patch_sizes: 3 + embedding_strides: 2 + num_attention_heads: 2 + - + num_blocks: 2 + sr_ratios: 2 + hidden_sizes: 160 + embedding_patch_sizes: 3 + embedding_strides: 2 + num_attention_heads: 5 + - + num_blocks: 2 + 
sr_ratios: 1 + hidden_sizes: 256 + embedding_patch_sizes: 3 + embedding_strides: 2 + num_attention_heads: 8 head: name: all_mlp_decoder losses: diff --git a/config/model/vit/vit-classification.yaml b/config/model/vit/vit-classification.yaml index 2fa9a0a8..5b0e063f 100644 --- a/config/model/vit/vit-classification.yaml +++ b/config/model/vit/vit-classification.yaml @@ -1,5 +1,6 @@ model: task: classification + name: vit_tiny checkpoint: ./weights/vit/vit-tiny.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -8,16 +9,21 @@ model: full: ~ # auto backbone: name: vit - patch_size: 16 - hidden_size: 192 - num_blocks: 12 - num_attention_heads: 3 - attention_dropout_prob: 0.0 - intermediate_size: 768 # hidden_size * 4 - hidden_dropout_prob: 0.1 + params: + patch_size: 16 + hidden_size: 192 + num_blocks: 12 + num_attention_heads: 3 + attention_dropout_prob: 0.0 + intermediate_size: 768 # hidden_size * 4 + hidden_dropout_prob: 0.1 + layer_norm_eps: 1e-6 + use_cls_token: True + vocab_size: 1000 + stage_params: ~ head: name: fc losses: - - criterion: label_smoothing_cross_entropy - smoothing: 0.1 + - criterion: cross_entropy + label_smoothing: 0.1 weight: ~ \ No newline at end of file diff --git a/config/model/yolox/yolox-detection.yaml b/config/model/yolox/yolox-detection.yaml index a5502fba..67137cfa 100644 --- a/config/model/yolox/yolox-detection.yaml +++ b/config/model/yolox/yolox-detection.yaml @@ -1,5 +1,6 @@ model: task: detection + name: yolox_s checkpoint: ./weights/yolox/yolox_s.pth fx_model_checkpoint: ~ resume_optimizer_checkpoint: ~ @@ -8,10 +9,15 @@ model: full: ~ # auto backbone: name: cspdarknet - dep_mul: 0.33 - wid_mul: 0.5 + params: + dep_mul: 0.33 + wid_mul: 0.5 + act_type: "silu" + stage_params: ~ + neck: + name: pafpn head: - name: yolo_head + name: yolox_head losses: - criterion: yolox_loss weight: ~ \ No newline at end of file diff --git a/demo/gradio_augmentation.py b/demo/gradio_augmentation.py index e885dab7..5c7cc71b 100644 --- a/demo/gradio_augmentation.py +++ b/demo/gradio_augmentation.py @@ -25,7 +25,7 @@ def summary_transform(phase, task, model_name, yaml_str): try: conf = OmegaConf.create(yaml_str) is_training = (phase == 'train') - transform = CREATE_TRANSFORM[task](model_name, is_training=is_training) + transform = CREATE_TRANSFORM(model_name, is_training=is_training) transform_composed = transform(conf.augmentation) return str(transform_composed) except Exception as e: @@ -37,7 +37,7 @@ def get_augmented_images(phase, task, model_name, yaml_str, test_image, try: conf = OmegaConf.create(yaml_str) is_training = (phase == 'train') - transform = CREATE_TRANSFORM[task](model_name, is_training=is_training) + transform = CREATE_TRANSFORM(model_name, is_training=is_training) transform_composed = transform(conf.augmentation) transformed_images = [transform_composed(test_image, @@ -88,7 +88,7 @@ def launch_gradio(args): task_choices = gr.Radio(label="Task: ", value='classification', choices=SUPPORTING_TASK_LIST) with gr.Column(scale=1): phase_choices = gr.Radio(label="Phase: ", value='train', choices=['train', 'valid']) - model_choices = gr.Radio(label="Model: ", value='resnet50', choices=SUPPORTING_MODEL_LIST) + model_choices = gr.Radio(label="Model: ", value='resnet', choices=SUPPORTING_MODEL_LIST) with gr.Row(equal_height=True): with gr.Column(scale=1): config_input = gr.Code(label="Augmentation configuration", value=args.config.read_text(), language='yaml', lines=30) diff --git a/pyproject.toml b/pyproject.toml index 303021ee..ca0b3de6 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ extend-select = [ "I", "SIM", "INP001", + "W" ] ignore = [ @@ -19,6 +20,8 @@ extend-exclude = [ "docs/*.py", "src/netspresso_trainer/models/backbones/core", "src/netspresso_trainer/models/backbones/experimental", + "src/netspresso_trainer/models/necks/core", + "src/netspresso_trainer/models/necks/experimental", "src/netspresso_trainer/models/heads/classification", "src/netspresso_trainer/models/heads/detection", "src/netspresso_trainer/models/heads/segmentation", diff --git a/src/netspresso_trainer/VERSION b/src/netspresso_trainer/VERSION index 429d94ae..b0a12275 100644 --- a/src/netspresso_trainer/VERSION +++ b/src/netspresso_trainer/VERSION @@ -1 +1 @@ -0.0.9 \ No newline at end of file +0.0.10 \ No newline at end of file diff --git a/src/netspresso_trainer/__init__.py b/src/netspresso_trainer/__init__.py index 24b54002..0c4517b2 100644 --- a/src/netspresso_trainer/__init__.py +++ b/src/netspresso_trainer/__init__.py @@ -10,4 +10,4 @@ version = (Path(__file__).parent / "VERSION").read_text().strip() -__version__ = version \ No newline at end of file +__version__ = version diff --git a/src/netspresso_trainer/cfg/__init__.py b/src/netspresso_trainer/cfg/__init__.py index 892dcd41..4315dab4 100644 --- a/src/netspresso_trainer/cfg/__init__.py +++ b/src/netspresso_trainer/cfg/__init__.py @@ -8,6 +8,12 @@ ClassificationAugmentationConfig, ColorJitter, DetectionAugmentationConfig, + Pad, + RandomCrop, + RandomHorizontalFlip, + RandomResizedCrop, + RandomVerticalFlip, + Resize, SegmentationAugmentationConfig, ) from .data import ( @@ -32,15 +38,22 @@ from .logging import LoggingConfig from .model import ( ClassificationEfficientFormerModelConfig, + ClassificationMixNetLargeModelConfig, + ClassificationMixNetMediumModelConfig, + ClassificationMixNetSmallModelConfig, ClassificationMobileNetV3ModelConfig, ClassificationMobileViTModelConfig, ClassificationResNetModelConfig, ClassificationSegFormerModelConfig, ClassificationViTModelConfig, DetectionEfficientFormerModelConfig, + DetectionYoloXModelConfig, ModelConfig, PIDNetModelConfig, SegmentationEfficientFormerModelConfig, + SegmentationMixNetLargeModelConfig, + SegmentationMixNetMediumModelConfig, + SegmentationMixNetSmallModelConfig, SegmentationMobileNetV3ModelConfig, SegmentationResNetModelConfig, SegmentationSegFormerModelConfig, @@ -59,6 +72,7 @@ 'detection': DetectionScheduleConfig } + @dataclass class TrainerConfig: task: str = field(default=MISSING, metadata={"omegaconf_ignore": True}) @@ -69,19 +83,19 @@ class TrainerConfig: training: Optional[ScheduleConfig] = None logging: LoggingConfig = field(default_factory=lambda: LoggingConfig()) environment: EnvironmentConfig = field(default_factory=lambda: EnvironmentConfig()) - + @property def epochs(self) -> int: return self.training.epochs - + @property def batch_size(self) -> int: return self.training.batch_size - + @property def num_workers(self) -> int: return self.environment.num_workers - + @epochs.setter def epochs(self, v: int) -> None: self.training.epochs = v @@ -89,18 +103,18 @@ def epochs(self, v: int) -> None: @batch_size.setter def batch_size(self, v: int) -> None: self.training.batch_size = v - + @num_workers.setter def num_workers(self, v: int) -> None: self.environment.num_workers = v - + def __post_init__(self): assert self.task in ['classification', 'segmentation', 'detection'] self.data.task = self.task self.model.task = self.task - + if self.auto: if self.augmentation is None: self.augmentation = 
_AUGMENTATION_CONFIG_TYPE_DICT[self.task]() if self.training is None: - self.training = _TRAINING_CONFIG_TYPE_DICT[self.task]() \ No newline at end of file + self.training = _TRAINING_CONFIG_TYPE_DICT[self.task]() diff --git a/src/netspresso_trainer/cfg/augmentation.py b/src/netspresso_trainer/cfg/augmentation.py index 8e9378b7..1a0d5730 100644 --- a/src/netspresso_trainer/cfg/augmentation.py +++ b/src/netspresso_trainer/cfg/augmentation.py @@ -1,12 +1,29 @@ from dataclasses import dataclass, field from pathlib import Path -from typing import Optional, Union +from typing import List, Optional, Union from omegaconf import MISSING, MissingMandatoryValue +DEFAULT_IMG_SIZE = 256 + @dataclass -class ColorJitter: +class Transform: + name: str = MISSING + + +@dataclass +class AugmentationConfig: + img_size: int = DEFAULT_IMG_SIZE + transforms: List[Transform] = field(default_factory=lambda: [ + Transform() + ]) + mix_transforms: Optional[List[Transform]] = None + + +@dataclass +class ColorJitter(Transform): + name: str = 'colorjitter' brightness: Optional[float] = 0.25 contrast: Optional[float] = 0.25 saturation: Optional[float] = 0.25 @@ -15,56 +32,83 @@ class ColorJitter: @dataclass -class AugmentationConfig: - img_size: int = 256 - max_scale: Optional[int] = 1024 - min_scale: Optional[int] = None - crop_size_h: Optional[int] = None - crop_size_w: Optional[int] = None - resize_ratio0: Optional[float] = None - resize_ratiof: Optional[float] = None - resize_add: Optional[float] = 1 - fliplr: Optional[float] = 0.5 - color_jitter: Optional[ColorJitter] = field(default_factory=lambda: ColorJitter()) - - +class Pad(Transform): + name: str = 'pad' + padding: Union[int, List] = 0 + + +@dataclass +class RandomCrop(Transform): + name: str = 'randomcrop' + size: int = DEFAULT_IMG_SIZE + interpolation: Optional[str] = 'bilinear' + + +@dataclass +class RandomResizedCrop(Transform): + name: str = 'randomresizedcrop' + size: int = DEFAULT_IMG_SIZE + interpolation: Optional[str] = 'bilinear' + + +@dataclass +class RandomHorizontalFlip(Transform): + name: str = 'randomhorizontalflip' + p: float = 0.5 + + +@dataclass +class RandomVerticalFlip(Transform): + name: str = 'randomverticalflip' + p: float = 0.5 + + +@dataclass +class Resize(Transform): + name: str = 'resize' + size: int = DEFAULT_IMG_SIZE + interpolation: Optional[str] = 'bilinear' + + +@dataclass +class RandomMixup(Transform): + name: str = 'mixup' + alpha: float = 0.2 + p: float = 1.0 + + +@dataclass +class RandomCutmix(Transform): + name: str = 'cutmix' + alpha: float = 1.0 + p: float = 1.0 + @dataclass class ClassificationAugmentationConfig(AugmentationConfig): - resize_ratio0 = None - resize_ratiof = None - resize_add = None - color_jitter = None + img_size: int = 256 + transforms: List[Transform] = field(default_factory=lambda: [ + RandomResizedCrop(size=256), + RandomHorizontalFlip() + ]) + mix_transforms: List[Transform] = field(default_factory=lambda: [ + RandomCutmix(), + ]) @dataclass class SegmentationAugmentationConfig(AugmentationConfig): - img_size = 512 - resize_ratio0 = 1.0 - resize_ratiof = 1.5 - - def __post_init__(self): - # variable interpolation - if self.min_scale is None: - self.min_scale = self.img_size - if self.crop_size_h is None: - self.crop_size_h = self.img_size - if self.crop_size_w is None: - self.crop_size_w = self.img_size - + img_size: int = 512 + transforms: List[Transform] = field(default_factory=lambda: [ + RandomResizedCrop(size=512), + RandomHorizontalFlip(), + ColorJitter() + ]) + @dataclass class 
DetectionAugmentationConfig(AugmentationConfig): - img_size = 512 - max_scale = 2048 - min_scale = 768 - resize_ratio0: 0.5 - resize_ratiof: 2.0 - resize_add: 1 - - def __post_init__(self): - # variable interpolation - if self.crop_size_h is None: - self.crop_size_h = self.img_size - if self.crop_size_w is None: - self.crop_size_w = self.img_size \ No newline at end of file + img_size: int = 512 + transforms: List[Transform] = field(default_factory=lambda: [ + Resize(size=512) + ]) diff --git a/src/netspresso_trainer/cfg/data.py b/src/netspresso_trainer/cfg/data.py index 000624b1..21e2abc5 100644 --- a/src/netspresso_trainer/cfg/data.py +++ b/src/netspresso_trainer/cfg/data.py @@ -262,4 +262,4 @@ class HuggingFaceSegmentationDatasetConfig(DatasetConfig): subset="full", features={"image": "image", "label": "artist"} ) -) \ No newline at end of file +) diff --git a/src/netspresso_trainer/cfg/model.py b/src/netspresso_trainer/cfg/model.py index c9d47604..71184e1c 100644 --- a/src/netspresso_trainer/cfg/model.py +++ b/src/netspresso_trainer/cfg/model.py @@ -18,20 +18,31 @@ "ClassificationSegFormerModelConfig", "SegmentationSegFormerModelConfig", "ClassificationViTModelConfig", + "DetectionYoloXModelConfig", + "ClassificationMixNetSmallModelConfig", + "ClassificationMixNetMediumModelConfig", + "ClassificationMixNetLargeModelConfig", + "SegmentationMixNetSmallModelConfig", + "SegmentationMixNetMediumModelConfig", + "SegmentationMixNetLargeModelConfig", ] + @dataclass class ArchitectureConfig: full: Optional[Dict[str, Any]] = None backbone: Optional[Dict[str, Any]] = None + neck: Optional[Dict[str, Any]] = None head: Optional[Dict[str, Any]] = None - + def __post_init__(self): assert bool(self.full) != bool(self.backbone), "Only one of full or backbone should be given." 
- + + @dataclass class ModelConfig: task: str = MISSING + name: str = MISSING checkpoint: Optional[Union[Path, str]] = None fx_model_checkpoint: Optional[Union[Path, str]] = None resume_optimizer_checkpoint: Optional[Union[Path, str]] = None @@ -44,56 +55,81 @@ class ModelConfig: class EfficientFormerArchitectureConfig(ArchitectureConfig): backbone: Dict[str, Any] = field(default_factory=lambda: { "name": "efficientformer", - "num_blocks": [3, 2, 6, 4], - "hidden_sizes": [48, 96, 224, 448], - "num_attention_heads": 8, - "attention_hidden_size": 256, # attention_hidden_size_splitted * num_attention_heads - "attention_dropout_prob": 0., - "attention_ratio": 4, - "attention_bias_resolution": 16, - "pool_size": 3, - "intermediate_ratio": 4, - "hidden_dropout_prob": 0., - "hidden_activation_type": 'gelu', - "layer_norm_eps": 1e-5, - "drop_path_rate": 0., - "use_layer_scale": True, - "layer_scale_init_value": 1e-5, - "downsamples": [True, True, True, True], - "down_patch_size": 3, - "down_stride": 2, - "down_pad": 1, - "vit_num": 1, + "params": { + "num_attention_heads": 8, + "attention_hidden_size": 256, + "attention_dropout_prob": 0., + "attention_ratio": 4, + "attention_bias_resolution": 16, + "pool_size": 3, + "intermediate_ratio": 4, + "hidden_dropout_prob": 0., + "hidden_activation_type": 'gelu', + "layer_norm_eps": 1e-5, + "drop_path_rate": 0., + "use_layer_scale": True, + "layer_scale_init_value": 1e-5, + "down_patch_size": 3, + "down_stride": 2, + "down_pad": 1, + "vit_num": 1, + }, + "stage_params": [ + {"num_blocks": 3, "hidden_sizes": 48, "downsamples": True}, + {"num_blocks": 2, "hidden_sizes": 96, "downsamples": True}, + {"num_blocks": 6, "hidden_sizes": 224, "downsamples": True}, + {"num_blocks": 4, "hidden_sizes": 448, "downsamples": True}, + ], }) @dataclass class MobileNetV3ArchitectureConfig(ArchitectureConfig): backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "mobilenetv3_small", - - # [in_channels, kernel, expended_channels, out_channels, use_se, activation, stride, dilation] - "block_info": [ - [ - [16, 3, 16, 16, True, "relu", 2, 1] - ], - [ - [16, 3, 72, 24, False, "relu", 2, 1], - [24, 3, 88, 24, False, "relu", 1, 1] - ], - [ - [24, 5, 96, 40, True, "hard_swish", 2, 1], - [40, 5, 240, 40, True, "hard_swish", 1, 1], - [40, 5, 240, 40, True, "hard_swish", 1, 1], - [40, 5, 120, 48, True, "hard_swish", 1, 1], - [48, 5, 144, 48, True, "hard_swish", 1, 1] - ], - [ - [48, 5, 288, 96, True, "hard_swish", 2, 1], - [96, 5, 576, 96, True, "hard_swish", 1, 1], - [96, 5, 576, 96, True, "hard_swish", 1, 1] - ] - ] + "name": "mobilenetv3", + "params": None, + "stage_params": [ + { + "in_channels": [16], + "kernel": [3], + "expanded_channels": [16], + "out_channels": [16], + "use_se": [True], + "activation": ["relu"], + "stride": [2], + "dilation": [1], + }, + { + "in_channels": [16, 24], + "kernel": [3, 3], + "expanded_channels": [72, 88], + "out_channels": [24, 24], + "use_se": [False, False], + "activation": ["relu", "relu"], + "stride": [2, 1], + "dilation": [1, 1], + }, + { + "in_channels": [24, 40, 40, 40, 48], + "kernel": [5, 5, 5, 5, 5], + "expanded_channels": [96, 240, 240, 120, 144], + "out_channels": [40, 40, 40, 48, 48], + "use_se": [True, True, True, True, True], + "activation": ["hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish"], + "stride": [2, 1, 1, 1, 1], + "dilation": [1, 1, 1, 1, 1], + }, + { + "in_channels": [48, 96, 96], + "kernel": [5, 5, 5], + "expanded_channels": [288, 576, 576], + "out_channels": [96, 96, 96], + "use_se": 
[True, True, True], + "activation": ["hard_swish", "hard_swish", "hard_swish"], + "stride": [2, 1, 1], + "dilation": [1, 1, 1], + }, + ], }) @@ -101,24 +137,74 @@ class MobileNetV3ArchitectureConfig(ArchitectureConfig): class MobileViTArchitectureConfig(ArchitectureConfig): backbone: Dict[str, Any] = field(default_factory=lambda: { "name": "mobilevit", - "out_channels": [32, 64, 96, 128, 160], - "block_type": ['mv2', 'mv2', 'mobilevit', 'mobilevit', 'mobilevit'], - "num_blocks": [1, 3, None, None, None], - "stride": [1, 2, 2, 2, 2], - "hidden_size": [None, None, 144, 192, 240], - "intermediate_size": [None, None, 288, 384, 480], - "num_transformer_blocks": [None, None, 2, 4, 3], - "dilate": [None, None, False, False, False], - "expand_ratio": [4, 4, 4, 4, 4], # [mv2_exp_mult] * 4 - "patch_embedding_out_channels": 16, - "local_kernel_size": 3, - "patch_size": 2, - "num_attention_heads": 4, # num_heads - "attention_dropout_prob": 0.1, - "hidden_dropout_prob": 0.0, - "exp_factor": 4, - "layer_norm_eps": 1e-5, - "use_fusion_layer": True, + "params": { + "patch_embedding_out_channels": 16, + "local_kernel_size": 3, + "patch_size": 2, + "num_attention_heads": 4, + "attention_dropout_prob": 0.1, + "hidden_dropout_prob": 0.0, + "exp_factor": 4, + "layer_norm_eps": 1e-5, + "use_fusion_layer": True, + }, + "stage_params": [ + { + "out_channels": 32, + "block_type": "mv2", + "num_blocks": 1, + "stride": 1, + "hidden_size": None, + "intermediate_size": None, + "num_transformer_blocks": None, + "dilate": None, + "expand_ratio": 4, + }, + { + "out_channels": 64, + "block_type": "mv2", + "num_blocks": 3, + "stride": 2, + "hidden_size": None, + "intermediate_size": None, + "num_transformer_blocks": None, + "dilate": None, + "expand_ratio": 4, + }, + { + "out_channels": 96, + "block_type": "mobilevit", + "num_blocks": None, + "stride": 2, + "hidden_size": 144, + "intermediate_size": 288, + "num_transformer_blocks": 2, + "dilate": False, + "expand_ratio": 4, + }, + { + "out_channels": 128, + "block_type": "mobilevit", + "num_blocks": None, + "stride": 2, + "hidden_size": 192, + "intermediate_size": 384, + "num_transformer_blocks": 4, + "dilate": False, + "expand_ratio": 4, + }, + { + "out_channels": 160, + "block_type": "mobilevit", + "num_blocks": None, + "stride": 2, + "hidden_size": 240, + "intermediate_size": 480, + "num_transformer_blocks": 3, + "dilate": False, + "expand_ratio": 4, + }, + ] }) @@ -137,9 +223,21 @@ class PIDNetArchitectureConfig(ArchitectureConfig): @dataclass class ResNetArchitectureConfig(ArchitectureConfig): backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "resnet50", - "block": "bottleneck", - "layers": [3, 4, 6, 3], + "name": "resnet", + "params": { + "block": "bottleneck", + "norm_layer": "batch_norm", + "groups": 1, + "width_per_group": 64, + "zero_init_residual": False, + "expansion": None, + }, + "stage_params": [ + {"plane": 64, "layers": 3}, + {"plane": 128, "layers": 4}, + {"plane": 256, "layers": 6}, + {"plane": 512, "layers": 3}, + ], }) @@ -147,18 +245,47 @@ class ResNetArchitectureConfig(ArchitectureConfig): class SegFormerArchitectureConfig(ArchitectureConfig): backbone: Dict[str, Any] = field(default_factory=lambda: { "name": "segformer", - "num_modules": 4, - "num_blocks": [2, 2, 2, 2], - "sr_ratios": [8, 4, 2, 1], - "hidden_sizes": [32, 64, 160, 256], - "embedding_patch_sizes": [7, 3, 3, 3], - "embedding_strides": [4, 2, 2, 2], - "num_attention_heads": [1, 2, 5, 8], - "intermediate_ratio": 4, - "hidden_activation_type": "gelu", - 
"hidden_dropout_prob": 0.0, - "attention_dropout_prob": 0.0, - "layer_norm_eps": 1e-5, + "params": { + "intermediate_ratio": 4, + "hidden_activation_type": "gelu", + "hidden_dropout_prob": 0.0, + "attention_dropout_prob": 0.0, + "layer_norm_eps": 1e-5, + }, + "stage_params": [ + { + "num_blocks": 2, + "sr_ratios": 8, + "hidden_sizes": 32, + "embedding_patch_sizes": 7, + "embedding_strides": 4, + "num_attention_heads": 1, + }, + { + "num_blocks": 2, + "sr_ratios": 4, + "hidden_sizes": 64, + "embedding_patch_sizes": 3, + "embedding_strides": 2, + "num_attention_heads": 2, + }, + { + "num_blocks": 2, + "sr_ratios": 2, + "hidden_sizes": 160, + "embedding_patch_sizes": 3, + "embedding_strides": 2, + "num_attention_heads": 5, + }, + { + "num_blocks": 2, + "sr_ratios": 1, + "hidden_sizes": 256, + "embedding_patch_sizes": 3, + "embedding_strides": 2, + "num_attention_heads": 8, + }, + ], }) @@ -166,31 +293,241 @@ class SegFormerArchitectureConfig(ArchitectureConfig): class ViTArchitectureConfig(ArchitectureConfig): backbone: Dict[str, Any] = field(default_factory=lambda: { "name": "vit", - "patch_size": 16, - "hidden_size": 192, - "num_blocks": 12, - "num_attention_heads": 3, - "attention_dropout_prob": 0.0, - "intermediate_size": 192 * 4, - "hidden_dropout_prob": 0.1, + "params": { + "patch_size": 16, + "hidden_size": 192, + "num_blocks": 12, + "num_attention_heads": 3, + "attention_dropout_prob": 0.0, + "intermediate_size": 768, + "hidden_dropout_prob": 0.1, + "layer_norm_eps": 1e-6, + "use_cls_token": True, + "vocab_size": 1000, + }, + "stage_params": None, + }) + + +@dataclass +class MixNetSmallArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field(default_factory=lambda: { + "name": "mixnet", + "params": { + "stem_planes": 16, + "width_multi": 1.0, + "depth_multi": 1.0, + "dropout_rate": 0., + }, + "stage_params": [ + { + "expand_ratio": [1, 6, 3], + "out_channels": [16, 24, 24], + "num_blocks": [1, 1, 1], + "kernel_sizes": [[3], [3], [3]], + "exp_kernel_sizes": [[1], [1, 1], [1, 1]], + "poi_kernel_sizes": [[1], [1, 1], [1, 1]], + "stride": [1, 2, 1], + "dilation": [1, 1, 1], + "act_type": ["relu", "relu", "relu"], + "se_reduction_ratio": [None, None, None], + }, + { + "expand_ratio": [6, 6], + "out_channels": [40, 40], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7], [3, 5]], + "exp_kernel_sizes": [[1], [1, 1]], + "poi_kernel_sizes": [[1], [1, 1]], + "stride": [2, 1], + "dilation": [1, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + { + "expand_ratio": [6, 6, 6, 3], + "out_channels": [80, 80, 120, 120], + "num_blocks": [1, 2, 1, 2], + "kernel_sizes": [[3, 5, 7], [3, 5], [3, 5, 7], [3, 5, 7, 9]], + "exp_kernel_sizes": [[1], [1], [1, 1], [1, 1]], + "poi_kernel_sizes": [[1, 1], [1, 1], [1, 1], [1, 1]], + "stride": [2, 1, 1, 1], + "dilation": [1, 1, 1, 1], + "act_type": ["swish", "swish", "swish", "swish"], + "se_reduction_ratio": [4, 4, 2, 2], + }, + { + "expand_ratio": [6, 6], + "out_channels": [200, 200], + "num_blocks": [1, 2], + "kernel_sizes": [[3, 5, 7, 9, 11], [3, 5, 7, 9]], + "exp_kernel_sizes": [[1], [1]], + "poi_kernel_sizes": [[1], [1, 1]], + "stride": [2, 1], + "dilation": [1, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + ], + }) + + +@dataclass +class MixNetMediumArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field(default_factory=lambda: { + "name": "mixnet", + "params": { + "stem_planes": 24, + "width_multi": 1.0, + "depth_multi": 1.0, + "dropout_rate": 0., + }, + 
"stage_params": [ + { + "expand_ratio": [1, 6, 3], + "out_channels": [24, 32, 32], + "num_blocks": [1, 1, 1], + "kernel_sizes": [[3], [3, 5, 7], [3]], + "exp_kernel_sizes": [[1], [1, 1], [1, 1]], + "poi_kernel_sizes": [[1], [1, 1], [1, 1]], + "stride": [1, 2, 1], + "dilation": [1, 1, 1], + "act_type": ["relu", "relu", "relu"], + "se_reduction_ratio": [None, None, None], + }, + { + "expand_ratio": [6, 6], + "out_channels": [40, 40], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7, 9], [3, 5]], + "exp_kernel_sizes": [[1], [1, 1]], + "poi_kernel_sizes": [[1], [1, 1]], + "stride": [2, 1], + "dilation": [1, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + { + "expand_ratio": [6, 6, 6, 3], + "out_channels": [80, 80, 120, 120], + "num_blocks": [1, 3, 1, 3], + "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]], + "exp_kernel_sizes": [[1], [1, 1], [1], [1, 1]], + "poi_kernel_sizes": [[1], [1, 1], [1], [1, 1]], + "stride": [2, 1, 1, 1], + "dilation": [1, 1, 1, 1], + "act_type": ["swish", "swish", "swish", "swish"], + "se_reduction_ratio": [4, 4, 2, 2], + }, + { + "expand_ratio": [6, 6], + "out_channels": [200, 200], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]], + "exp_kernel_sizes": [[1], [1]], + "poi_kernel_sizes": [[1], [1, 1]], + "stride": [2, 1], + "dilation": [1, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + ], + }) + + +@dataclass +class MixNetLargeArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field(default_factory=lambda: { + "name": "mixnet", + "params": { + "stem_planes": 24, + "width_multi": 1.3, + "depth_multi": 1.0, + "dropout_rate": 0., + }, + "stage_params": [ + { + "expand_ratio": [1, 6, 3], + "out_channels": [24, 32, 32], + "num_blocks": [1, 1, 1], + "kernel_sizes": [[3], [3, 5, 7], [3]], + "exp_kernel_sizes": [[1], [1, 1], [1, 1]], + "poi_kernel_sizes": [[1], [1, 1], [1, 1]], + "stride": [1, 2, 1], + "dilation": [1, 1, 1], + "act_type": ["relu", "relu", "relu"], + "se_reduction_ratio": [None, None, None], + }, + { + "expand_ratio": [6, 6], + "out_channels": [40, 40], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7, 9], [3, 5]], + "exp_kernel_sizes": [[1], [1, 1]], + "poi_kernel_sizes": [[1], [1, 1]], + "stride": [2, 1], + "dilation": [1, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + { + "expand_ratio": [6, 6, 6, 3], + "out_channels": [80, 80, 120, 120], + "num_blocks": [1, 3, 1, 3], + "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]], + "exp_kernel_sizes": [[1], [1, 1], [1], [1, 1]], + "poi_kernel_sizes": [[1], [1, 1], [1], [1, 1]], + "stride": [2, 1, 1, 1], + "dilation": [1, 1, 1, 1], + "act_type": ["swish", "swish", "swish", "swish"], + "se_reduction_ratio": [4, 4, 2, 2], + }, + { + "expand_ratio": [6, 6], + "out_channels": [200, 200], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]], + "exp_kernel_sizes": [[1], [1]], + "poi_kernel_sizes": [[1], [1, 1]], + "stride": [2, 1], + "dilation": [1, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + ], + }) + + +@dataclass +class CSPDarkNetSmallArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field(default_factory=lambda: { + "name": "cspdarknet", + "params": { + "dep_mul": 0.33, + "wid_mul": 0.5, + "act_type": "silu", + }, + "stage_params": None, }) @dataclass class ClassificationEfficientFormerModelConfig(ModelConfig): task: str = "classification" + name: str = "efficientformer_l1" checkpoint: 
Optional[Union[Path, str]] = "./weights/efficientformer/efficientformer_l1_1000d.pth" architecture: ArchitectureConfig = field(default_factory=lambda: EfficientFormerArchitectureConfig( head={"name": "fc"} )) losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "label_smoothing_cross_entropy", "smoothing": 0.1, "weight": None} + {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} ]) @dataclass class SegmentationEfficientFormerModelConfig(ModelConfig): task: str = "segmentation" + name: str = "efficientformer_l1" checkpoint: Optional[Union[Path, str]] = "./weights/efficientformer/efficientformer_l1_1000d.pth" architecture: ArchitectureConfig = field(default_factory=lambda: EfficientFormerArchitectureConfig( head={"name": "all_mlp_decoder"} @@ -203,8 +540,10 @@ class SegmentationEfficientFormerModelConfig(ModelConfig): @dataclass class DetectionEfficientFormerModelConfig(ModelConfig): task: str = "detection" + name: str = "efficientformer_l1" checkpoint: Optional[Union[Path, str]] = "./weights/efficientformer/efficientformer_l1_1000d.pth" architecture: ArchitectureConfig = field(default_factory=lambda: EfficientFormerArchitectureConfig( + neck={"name": "fpn"}, head={"name": "faster_rcnn"} )) losses: List[Dict[str, Any]] = field(default_factory=lambda: [ @@ -216,18 +555,20 @@ class DetectionEfficientFormerModelConfig(ModelConfig): @dataclass class ClassificationMobileNetV3ModelConfig(ModelConfig): task: str = "classification" + name: str = "mobilenet_v3_small" checkpoint: Optional[Union[Path, str]] = "./weights/mobilenetv3/mobilenet_v3_small.pth" architecture: ArchitectureConfig = field(default_factory=lambda: MobileNetV3ArchitectureConfig( head={"name": "fc"} )) losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "label_smoothing_cross_entropy", "smoothing": 0.1, "weight": None} + {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} ]) @dataclass class SegmentationMobileNetV3ModelConfig(ModelConfig): task: str = "segmentation" + name: str = "mobilenet_v3_small" checkpoint: Optional[Union[Path, str]] = "./weights/mobilenetv3/mobilenet_v3_small.pth" architecture: ArchitectureConfig = field(default_factory=lambda: MobileNetV3ArchitectureConfig( head={"name": "all_mlp_decoder"} @@ -240,18 +581,20 @@ class SegmentationMobileNetV3ModelConfig(ModelConfig): @dataclass class ClassificationMobileViTModelConfig(ModelConfig): task: str = "classification" + name: str = "mobilevit_s" checkpoint: Optional[Union[Path, str]] = "./weights/mobilevit/mobilevit_s.pth" architecture: ArchitectureConfig = field(default_factory=lambda: MobileViTArchitectureConfig( head={"name": "fc"} )) losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "label_smoothing_cross_entropy", "smoothing": 0.1, "weight": None} + {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} ]) @dataclass class PIDNetModelConfig(ModelConfig): - task: str = "classification" + task: str = "segmentation" + name: str = "pidnet_s" checkpoint: Optional[Union[Path, str]] = "./weights/pidnet/pidnet_s.pth" architecture: ArchitectureConfig = field(default_factory=lambda: PIDNetArchitectureConfig()) losses: List[Dict[str, Any]] = field(default_factory=lambda: [ @@ -264,18 +607,20 @@ class PIDNetModelConfig(ModelConfig): @dataclass class ClassificationResNetModelConfig(ModelConfig): task: str = "classification" + name: str = "resnet50" checkpoint: Optional[Union[Path, str]] = "./weights/resnet/resnet50.pth" architecture: 
ArchitectureConfig = field(default_factory=lambda: ResNetArchitectureConfig( head={"name": "fc"} )) losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "label_smoothing_cross_entropy", "smoothing": 0.1, "weight": None} + {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} ]) @dataclass class SegmentationResNetModelConfig(ModelConfig): task: str = "segmentation" + name: str = "resnet50" checkpoint: Optional[Union[Path, str]] = "./weights/resnet/resnet50.pth" architecture: ArchitectureConfig = field(default_factory=lambda: ResNetArchitectureConfig( head={"name": "all_mlp_decoder"} @@ -288,18 +633,20 @@ class SegmentationResNetModelConfig(ModelConfig): @dataclass class ClassificationSegFormerModelConfig(ModelConfig): task: str = "classification" + name: str = "segformer" checkpoint: Optional[Union[Path, str]] = "./weights/segformer/segformer.pth" architecture: ArchitectureConfig = field(default_factory=lambda: SegFormerArchitectureConfig( head={"name": "fc"} )) losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "label_smoothing_cross_entropy", "smoothing": 0.1, "weight": None} + {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} ]) @dataclass class SegmentationSegFormerModelConfig(ModelConfig): task: str = "segmentation" + name: str = "segformer" checkpoint: Optional[Union[Path, str]] = "./weights/segformer/segformer.pth" architecture: ArchitectureConfig = field(default_factory=lambda: SegFormerArchitectureConfig( head={"name": "all_mlp_decoder"} @@ -312,11 +659,103 @@ class SegmentationSegFormerModelConfig(ModelConfig): @dataclass class ClassificationViTModelConfig(ModelConfig): task: str = "classification" + name: str = "vit_tiny" checkpoint: Optional[Union[Path, str]] = "./weights/vit/vit-tiny.pth" architecture: ArchitectureConfig = field(default_factory=lambda: ViTArchitectureConfig( head={"name": "fc"} )) losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "label_smoothing_cross_entropy", "smoothing": 0.1, "weight": None} + {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} + ]) + + +@dataclass +class DetectionYoloXModelConfig(ModelConfig): + task: str = "detection" + name: str = "yolox_s" + checkpoint: Optional[Union[Path, str]] = "./weights/yolox/yolox_s.pth" + architecture: ArchitectureConfig = field(default_factory=lambda: CSPDarkNetSmallArchitectureConfig( + neck={"name": "pafpn"}, + head={"name": "yolox_head"} + )) + losses: List[Dict[str, Any]] = field(default_factory=lambda: [ + {"criterion": "yolox_loss", "weight": None} + ]) + + +@dataclass +class ClassificationMixNetSmallModelConfig(ModelConfig): + task: str = "classification" + name: str = "mixnet_s" + checkpoint: Optional[Union[Path, str]] = "./weights/mixnet/mixnet_s.pth" + architecture: ArchitectureConfig = field(default_factory=lambda: MixNetSmallArchitectureConfig( + head={"name": "fc"} + )) + losses: List[Dict[str, Any]] = field(default_factory=lambda: [ + {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} ]) + +@dataclass +class SegmentationMixNetSmallModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "mixnet_s" + checkpoint: Optional[Union[Path, str]] = "./weights/mixnet/mixnet_s.pth" + architecture: ArchitectureConfig = field(default_factory=lambda: MixNetSmallArchitectureConfig( + head={"name": "all_mlp_decoder"} + )) + losses: List[Dict[str, Any]] = field(default_factory=lambda: [ + {"criterion": "cross_entropy", "ignore_index": 255, 
"weight": None} + ]) + + +@dataclass +class ClassificationMixNetMediumModelConfig(ModelConfig): + task: str = "classification" + name: str = "mixnet_m" + checkpoint: Optional[Union[Path, str]] = "./weights/mixnet/mixnet_m.pth" + architecture: ArchitectureConfig = field(default_factory=lambda: MixNetMediumArchitectureConfig( + head={"name": "fc"} + )) + losses: List[Dict[str, Any]] = field(default_factory=lambda: [ + {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} + ]) + + +@dataclass +class SegmentationMixNetMediumModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "mixnet_m" + checkpoint: Optional[Union[Path, str]] = "./weights/mixnet/mixnet_m.pth" + architecture: ArchitectureConfig = field(default_factory=lambda: MixNetMediumArchitectureConfig( + head={"name": "all_mlp_decoder"} + )) + losses: List[Dict[str, Any]] = field(default_factory=lambda: [ + {"criterion": "cross_entropy", "ignore_index": 255, "weight": None} + ]) + + +@dataclass +class ClassificationMixNetLargeModelConfig(ModelConfig): + task: str = "classification" + name: str = "mixnet_l" + checkpoint: Optional[Union[Path, str]] = "./weights/mixnet/mixnet_l.pth" + architecture: ArchitectureConfig = field(default_factory=lambda: MixNetLargeArchitectureConfig( + head={"name": "fc"} + )) + losses: List[Dict[str, Any]] = field(default_factory=lambda: [ + {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} + ]) + + +@dataclass +class SegmentationMixNetLargeModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "mixnet_l" + checkpoint: Optional[Union[Path, str]] = "./weights/mixnet/mixnet_l.pth" + architecture: ArchitectureConfig = field(default_factory=lambda: MixNetLargeArchitectureConfig( + head={"name": "all_mlp_decoder"} + )) + losses: List[Dict[str, Any]] = field(default_factory=lambda: [ + {"criterion": "cross_entropy", "ignore_index": 255, "weight": None} + ]) diff --git a/src/netspresso_trainer/cfg/training.py b/src/netspresso_trainer/cfg/training.py index a13be88f..3c0c32ff 100644 --- a/src/netspresso_trainer/cfg/training.py +++ b/src/netspresso_trainer/cfg/training.py @@ -32,4 +32,4 @@ class SegmentationScheduleConfig(ScheduleConfig): @dataclass class DetectionScheduleConfig(ScheduleConfig): - pass \ No newline at end of file + pass diff --git a/src/netspresso_trainer/dataloaders/augmentation/__init__.py b/src/netspresso_trainer/dataloaders/augmentation/__init__.py index 0c893e74..624fcd12 100644 --- a/src/netspresso_trainer/dataloaders/augmentation/__init__.py +++ b/src/netspresso_trainer/dataloaders/augmentation/__init__.py @@ -12,3 +12,4 @@ Resize, ToTensor, ) +from .registry import TRANSFORM_DICT diff --git a/src/netspresso_trainer/dataloaders/augmentation/custom.py b/src/netspresso_trainer/dataloaders/augmentation/custom.py index 720e5b0c..0895bd3b 100644 --- a/src/netspresso_trainer/dataloaders/augmentation/custom.py +++ b/src/netspresso_trainer/dataloaders/augmentation/custom.py @@ -1,15 +1,27 @@ +import math import random from collections.abc import Sequence -from typing import Dict, Optional +from typing import Dict, List, Optional, Tuple import numpy as np import PIL.Image as Image import torch import torchvision.transforms as T import torchvision.transforms.functional as F +from omegaconf import ListConfig +from torch import Tensor +from torch.nn import functional as F_torch +from torchvision.transforms.autoaugment import _apply_op +from torchvision.transforms.functional import InterpolationMode BBOX_CROP_KEEP_THRESHOLD = 0.2 MAX_RETRY = 5 
+INVERSE_MODES_MAPPING = { + 'nearest': InterpolationMode.NEAREST, + 'bilinear': InterpolationMode.BILINEAR, + 'bicubic': InterpolationMode.BICUBIC, +} + class Compose: def __init__(self, transforms, additional_targets: Dict = None): @@ -22,7 +34,7 @@ def _get_transformed(self, image, mask, bbox, visualize_for_debug): for t in self.transforms: if visualize_for_debug and not t.visualize: continue - image, mask, bbox = t(image=image, mask=mask, bbox=bbox) + image, mask, bbox = t(image=image, mask=mask, bbox=bbox) return image, mask, bbox def __call__(self, image, mask=None, bbox=None, visualize_for_debug=False, **kwargs): @@ -91,6 +103,18 @@ def __repr__(self): class Resize(T.Resize): visualize = True + def __init__(self, size, interpolation='bilinear', max_size=None, antialias=None): + interpolation = INVERSE_MODES_MAPPING[interpolation] + + # TODO: There is logic error in forward. If `size` is int, this specify edge for shorter one. + # And, this is not match with bbox computing logic. + # Thus, automatically transform to sequence format for now, + # but this should be specified whether Resize receives sequence or int. + if isinstance(size, int): + size = [size, size] + + super().__init__(size, interpolation, max_size, antialias) + def forward(self, image, mask=None, bbox=None): w, h = image.size @@ -275,6 +299,15 @@ def __repr__(self): class RandomResizedCrop(T.RandomResizedCrop): visualize = True + def __init__(self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation='bilinear', + antialias: Optional[bool]=None): + interpolation = INVERSE_MODES_MAPPING[interpolation] + super().__init__(size, scale, ratio, interpolation, antialias) + def _crop_bbox(self, bbox, i, j, h, w): area_original = (bbox[..., 2] - bbox[..., 0]) * (bbox[..., 3] - bbox[..., 1]) @@ -321,6 +354,300 @@ def __repr__(self): return format_string +class RandomErasing(T.RandomErasing): + visualize = True + + def __init__(self, p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False): + if isinstance(scale, ListConfig): + scale = tuple(scale) + if isinstance(ratio, ListConfig): + ratio = tuple(ratio) + if isinstance(value, ListConfig): + value = tuple(value) + super().__init__(p, scale, ratio, value, inplace) + + @staticmethod + def get_params( + img, scale: Tuple[float, float], ratio: Tuple[float, float], value: Optional[int] = None + ): + img_w, img_h = img.size + + area = img_h * img_w + + log_ratio = torch.log(torch.tensor(ratio)) + for _ in range(10): + erase_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item() + aspect_ratio = torch.exp(torch.empty(1).uniform_(log_ratio[0], log_ratio[1])).item() + + h = int(round(math.sqrt(erase_area * aspect_ratio))) + w = int(round(math.sqrt(erase_area / aspect_ratio))) + if not (h < img_h and w < img_w): + continue + + if value is None: + v = np.random.randint(255, size=(h, w)).astype('uint8') + v = Image.fromarray(v).convert(img.mode) + else: + v = Image.new(img.mode, (w, h), value) + + i = torch.randint(0, img_h - h + 1, size=(1,)).item() + j = torch.randint(0, img_w - w + 1, size=(1,)).item() + return i, j, v + + # Return original image + return 0, 0, img + + def forward(self, image, mask=None, bbox=None): + if torch.rand(1) < self.p: + x, y, v = self.get_params(image, scale=self.scale, ratio=self.ratio, value=self.value) + image.paste(v, (y, x)) + # TODO: Object-aware + return image, mask, bbox + return image, mask, bbox + + +class TrivialAugmentWide(torch.nn.Module): + """ + Based on the torchvision implementation. 
+ https://pytorch.org/vision/main/_modules/torchvision/transforms/autoaugment.html#TrivialAugmentWide + """ + visualize = True + + def __init__( + self, + num_magnitude_bins: int = 31, + interpolation: InterpolationMode = 'bilinear', + fill: Optional[List[float]] = None, + ) -> None: + super().__init__() + interpolation = INVERSE_MODES_MAPPING[interpolation] + + self.num_magnitude_bins = num_magnitude_bins + self.interpolation = interpolation + self.fill = fill + + def _augmentation_space(self, num_bins: int) -> Dict[str, Tuple[Tensor, bool]]: + return { + # op_name: (magnitudes, signed) + "Identity": (torch.tensor(0.0), False), + "ShearX": (torch.linspace(0.0, 0.99, num_bins), True), + "ShearY": (torch.linspace(0.0, 0.99, num_bins), True), + "TranslateX": (torch.linspace(0.0, 32.0, num_bins), True), + "TranslateY": (torch.linspace(0.0, 32.0, num_bins), True), + "Rotate": (torch.linspace(0.0, 135.0, num_bins), True), + "Brightness": (torch.linspace(0.0, 0.99, num_bins), True), + "Color": (torch.linspace(0.0, 0.99, num_bins), True), + "Contrast": (torch.linspace(0.0, 0.99, num_bins), True), + "Sharpness": (torch.linspace(0.0, 0.99, num_bins), True), + "Posterize": (8 - (torch.arange(num_bins) / ((num_bins - 1) / 6)).round().int(), False), + "Solarize": (torch.linspace(255.0, 0.0, num_bins), False), + "AutoContrast": (torch.tensor(0.0), False), + "Equalize": (torch.tensor(0.0), False), + } + + def forward(self, image, mask=None, bbox=None): + fill = self.fill + channels, height, width = F.get_dimensions(image) + if isinstance(image, Tensor): + if isinstance(fill, (int, float)): + fill = [float(fill)] * channels + elif fill is not None: + fill = [float(f) for f in fill] + + op_meta = self._augmentation_space(self.num_magnitude_bins) + op_index = int(torch.randint(len(op_meta), (1,)).item()) + op_name = list(op_meta.keys())[op_index] + magnitudes, signed = op_meta[op_name] + magnitude = ( + float(magnitudes[torch.randint(len(magnitudes), (1,), dtype=torch.long)].item()) + if magnitudes.ndim > 0 + else 0.0 + ) + if signed and torch.randint(2, (1,)): + magnitude *= -1.0 + + # TODO: Compute mask, bbox + return _apply_op(image, op_name, magnitude, interpolation=self.interpolation, fill=fill), mask, bbox + + def __repr__(self) -> str: + s = ( + f"{self.__class__.__name__}(" + f"num_magnitude_bins={self.num_magnitude_bins}" + f", interpolation={self.interpolation}" + f", fill={self.fill}" + f")" + ) + return s + + +class RandomMixup: + """ + Based on the RandomMixup implementation of ml_cvnets. + https://github.com/apple/ml-cvnets/blob/77717569ab4a852614dae01f010b32b820cb33bb/data/transforms/image_torch.py + + Given a batch of input images and labels, this class randomly applies the + `MixUp transformation `_ + + Args: + opts (argparse.Namespace): Arguments + num_classes (int): Number of classes in the dataset + """ + visualize = False + + def __init__(self, num_classes: int, alpha, p=1.0, inplace=False): + if not (num_classes > 0): + raise ValueError("Please provide a valid positive value for the num_classes.") + if not (alpha > 0): + raise ValueError("Alpha param can't be zero.") + if not (0.0 < p <= 1.0): + raise ValueError("MixUp probability should be between 0 and 1, where 1 is inclusive") + + self.num_classes = num_classes + self.alpha = alpha + self.p = p + self.inplace = inplace + + def _apply_mixup_transform(self, image_tensor, target_tensor): + if image_tensor.ndim != 4: + raise ValueError(f"Batch ndim should be 4. 
Got {image_tensor.ndim}") + if target_tensor.ndim != 1: + raise ValueError(f"Target ndim should be 1. Got {target_tensor.ndim}") + if not image_tensor.is_floating_point(): + raise ValueError(f"Batch datatype should be a float tensor. Got {image_tensor.dtype}.") + if target_tensor.dtype != torch.int64: + raise ValueError(f"Target datatype should be torch.int64. Got {target_tensor.dtype}") + + if not self.inplace: + image_tensor = image_tensor.clone() + target_tensor = target_tensor.clone() + + if target_tensor.ndim == 1: + target_tensor = F_torch.one_hot( + target_tensor, num_classes=self.num_classes + ).to(dtype=image_tensor.dtype) + + # It's faster to roll the batch by one instead of shuffling it to create image pairs + batch_rolled = image_tensor.roll(1, 0) + target_rolled = target_tensor.roll(1, 0) + + # Implemented as on mixup paper, page 3. + lambda_param = float( + torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0] + ) + batch_rolled.mul_(1.0 - lambda_param) + image_tensor.mul_(lambda_param).add_(batch_rolled) + + target_rolled.mul_(1.0 - lambda_param) + target_tensor.mul_(lambda_param).add_(target_rolled) + return image_tensor, target_tensor + + def __call__(self, samples, targets): + if torch.rand(1).item() >= self.p: + return samples, targets + + mixup_samples, mixup_targets = self._apply_mixup_transform( + image_tensor=samples, target_tensor=targets + ) + + return mixup_samples, mixup_targets + + def __repr__(self) -> str: + return "{}(num_classes={}, p={}, alpha={}, inplace={})".format( + self.__class__.__name__, self.num_classes, self.p, self.alpha, self.inplace + ) + + +class RandomCutmix: + """ + Based on the RandomCutmix implementation of ml_cvnets. + https://github.com/apple/ml-cvnets/blob/77717569ab4a852614dae01f010b32b820cb33bb/data/transforms/image_torch.py + + Given a batch of input images and labels, this class randomly applies the + `CutMix transformation `_ + + Args: + opts (argparse.Namespace): Arguments + num_classes (int): Number of classes in the dataset + """ + visualize = False + + def __init__(self, num_classes, alpha, p=1.0, inplace=False): + if not (num_classes > 0): + raise ValueError("Please provide a valid positive value for the num_classes.") + if not (alpha > 0): + raise ValueError("Alpha param can't be zero.") + if not (0.0 < p <= 1.0): + raise ValueError("CutMix probability should be between 0 and 1, where 1 is inclusive") + + self.num_classes = num_classes + self.alpha = alpha + self.p = p + self.inplace = inplace + + def _apply_cutmix_transform(self, image_tensor, target_tensor): + if image_tensor.ndim != 4: + raise ValueError(f"Batch ndim should be 4. Got {image_tensor.ndim}") + if target_tensor.ndim != 1: + raise ValueError(f"Target ndim should be 1. Got {target_tensor.ndim}") + if not image_tensor.is_floating_point(): + raise ValueError(f"Batch dtype should be a float tensor. Got {image_tensor.dtype}.") + if target_tensor.dtype != torch.int64: + raise ValueError(f"Target dtype should be torch.int64. Got {target_tensor.dtype}") + + if not self.inplace: + image_tensor = image_tensor.clone() + target_tensor = target_tensor.clone() + + if target_tensor.ndim == 1: + target_tensor = F_torch.one_hot( + target_tensor, num_classes=self.num_classes + ).to(dtype=image_tensor.dtype) + + # It's faster to roll the batch by one instead of shuffling it to create image pairs + batch_rolled = image_tensor.roll(1, 0) + target_rolled = target_tensor.roll(1, 0) + + # Implemented as on cutmix paper, page 12 (with minor corrections on typos). 
+ lambda_param = float( + torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0] + ) + W, H = F.get_image_size(image_tensor) + + r_x = torch.randint(W, (1,)) + r_y = torch.randint(H, (1,)) + + r = 0.5 * math.sqrt(1.0 - lambda_param) + r_w_half = int(r * W) + r_h_half = int(r * H) + + x1 = int(torch.clamp(r_x - r_w_half, min=0)) + y1 = int(torch.clamp(r_y - r_h_half, min=0)) + x2 = int(torch.clamp(r_x + r_w_half, max=W)) + y2 = int(torch.clamp(r_y + r_h_half, max=H)) + + image_tensor[:, :, y1:y2, x1:x2] = batch_rolled[:, :, y1:y2, x1:x2] + lambda_param = float(1.0 - (x2 - x1) * (y2 - y1) / (W * H)) + + target_rolled.mul_(1.0 - lambda_param) + target_tensor.mul_(lambda_param).add_(target_rolled) + return image_tensor, target_tensor + + def __call__(self, samples, targets) -> Dict: + if torch.rand(1).item() >= self.p: + return samples, targets + + mixup_samples, mixup_targets = self._apply_cutmix_transform( + image_tensor=samples, target_tensor=targets + ) + + return mixup_samples, mixup_targets + + def __repr__(self) -> str: + return "{}(num_classes={}, p={}, alpha={}, inplace={})".format( + self.__class__.__name__, self.num_classes, self.p, self.alpha, self.inplace + ) + + class Normalize: visualize = False @@ -351,4 +678,4 @@ def __call__(self, image, mask=None, bbox=None): return image, mask, bbox def __repr__(self): - return self.__class__.__name__ + "()" \ No newline at end of file + return self.__class__.__name__ + "()" diff --git a/src/netspresso_trainer/dataloaders/augmentation/registry.py b/src/netspresso_trainer/dataloaders/augmentation/registry.py new file mode 100644 index 00000000..139ac1a7 --- /dev/null +++ b/src/netspresso_trainer/dataloaders/augmentation/registry.py @@ -0,0 +1,29 @@ +from typing import Callable, Dict + +from .custom import ( + ColorJitter, + Pad, + RandomCrop, + RandomCutmix, + RandomErasing, + RandomHorizontalFlip, + RandomMixup, + RandomResizedCrop, + RandomVerticalFlip, + Resize, + TrivialAugmentWide, +) + +TRANSFORM_DICT: Dict[str, Callable] = { + 'colorjitter': ColorJitter, + 'pad': Pad, + 'randomcrop': RandomCrop, + 'randomresizedcrop': RandomResizedCrop, + 'randomhorizontalflip': RandomHorizontalFlip, + 'randomverticalflip': RandomVerticalFlip, + 'randomerasing': RandomErasing, + 'resize': Resize, + 'mixup': RandomMixup, + 'cutmix': RandomCutmix, + 'trivialaugmentwide': TrivialAugmentWide, +} diff --git a/src/netspresso_trainer/dataloaders/augmentation/transforms.py b/src/netspresso_trainer/dataloaders/augmentation/transforms.py new file mode 100644 index 00000000..35adbc0b --- /dev/null +++ b/src/netspresso_trainer/dataloaders/augmentation/transforms.py @@ -0,0 +1,89 @@ +import cv2 +import numpy as np +import PIL.Image as Image + +from ..utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from . 
import custom as TC +from .registry import TRANSFORM_DICT + +EDGE_SIZE = 4 +Y_K_SIZE = 6 +X_K_SIZE = 6 + + +def reduce_label(label: np.ndarray) -> Image.Image: + label[label == 0] = 255 + label = label - 1 + label[label == 254] = 255 + return Image.fromarray(label) + + +def generate_edge(label: np.ndarray) -> Image.Image: + edge = cv2.Canny(label, 0.1, 0.2) + kernel = np.ones((EDGE_SIZE, EDGE_SIZE), np.uint8) + # edge_pad == True + edge = edge[Y_K_SIZE:-Y_K_SIZE, X_K_SIZE:-X_K_SIZE] + edge = np.pad(edge, ((Y_K_SIZE, Y_K_SIZE), (X_K_SIZE, X_K_SIZE)), mode='constant') + edge = (cv2.dilate(edge, kernel, iterations=1) > 50) * 1.0 + return Image.fromarray((edge.copy() * 255).astype(np.uint8)) + + +def transforms_custom_train(conf_augmentation): + assert conf_augmentation.img_size > 32 + preprocess = [] + for augment in conf_augmentation.transforms: + name = augment.name.lower() + augment_kwargs = list(augment.keys()) + augment_kwargs.remove('name') + augment_kwargs = {k:augment[k] for k in augment_kwargs} + transform = TRANSFORM_DICT[name](**augment_kwargs) + preprocess.append(transform) + + preprocess = preprocess + [ + TC.ToTensor(), + TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) + ] + return TC.Compose(preprocess) + + +def transforms_custom_eval(conf_augmentation): + assert conf_augmentation.img_size > 32 + preprocess = [ + TC.Resize((conf_augmentation.img_size, conf_augmentation.img_size), interpolation='bilinear'), + TC.ToTensor(), + TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) + ] + return TC.Compose(preprocess) + + +def train_transforms_pidnet(conf_augmentation): + preprocess = [] + for augment in conf_augmentation.transforms: + name = augment.name.lower() + augment_kwargs = list(augment.keys()) + augment_kwargs.remove('name') + augment_kwargs = {k:augment[k] for k in augment_kwargs} + transform = TRANSFORM_DICT[name](**augment_kwargs) + preprocess.append(transform) + + preprocess = preprocess + [ + TC.ToTensor(), + TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) + ] + return TC.Compose(preprocess, additional_targets={'edge': 'mask'}) + + +def val_transforms_pidnet(conf_augmentation): + assert conf_augmentation.img_size > 32 + preprocess = [ + TC.Resize((conf_augmentation.img_size, conf_augmentation.img_size), interpolation='bilinear'), + TC.ToTensor(), + TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) + ] + return TC.Compose(preprocess, additional_targets={'edge': 'mask'}) + + +def create_transform(model_name: str, is_training=False): + if 'pidnet' in model_name: + return train_transforms_pidnet if is_training else val_transforms_pidnet + return transforms_custom_train if is_training else transforms_custom_eval diff --git a/src/netspresso_trainer/dataloaders/base.py b/src/netspresso_trainer/dataloaders/base.py index 5e1e8b11..e00bb5a1 100644 --- a/src/netspresso_trainer/dataloaders/base.py +++ b/src/netspresso_trainer/dataloaders/base.py @@ -15,10 +15,10 @@ def __init__(self, conf_data, conf_augmentation, model_name, idx_to_class, split self.conf_data = conf_data self.conf_augmentation = conf_augmentation self.model_name = model_name - + self.transform = transform self.samples = samples - + self._root = conf_data.path.root self._idx_to_class = idx_to_class self._num_classes = len(self._idx_to_class) @@ -47,12 +47,12 @@ def root(self): @property def mode(self): return self._split - + @property def with_label(self): return self._with_label - - + + class BaseHFDataset(data.Dataset): def __init__(self, 
conf_data, conf_augmentation, model_name, root, split, with_label): @@ -64,7 +64,7 @@ def __init__(self, conf_data, conf_augmentation, model_name, root, split, with_l self._split = split self._with_label = with_label - def _load_dataset(self, root, subset_name=None, cache_dir=None): + def _load_dataset(self, root, subset_name=None, cache_dir=None): from datasets import load_dataset if cache_dir is not None: Path(cache_dir).mkdir(exist_ok=True, parents=True) @@ -94,7 +94,7 @@ def root(self): @property def mode(self): return self._split - + @property def with_label(self): return self._with_label @@ -104,15 +104,15 @@ class BaseDataSampler(ABC): def __init__(self, conf_data, train_valid_split_ratio): self.conf_data = conf_data self.train_valid_split_ratio = train_valid_split_ratio - + @abstractmethod def load_data(self): raise NotImplementedError - + @abstractmethod def load_samples(self): raise NotImplementedError - + @abstractmethod def load_huggingface_samples(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/src/netspresso_trainer/dataloaders/builder.py b/src/netspresso_trainer/dataloaders/builder.py index f13a7fea..3910ec45 100644 --- a/src/netspresso_trainer/dataloaders/builder.py +++ b/src/netspresso_trainer/dataloaders/builder.py @@ -1,8 +1,11 @@ import logging import os +from functools import partial from pathlib import Path from typing import Dict, List, Optional, Type, Union +from .augmentation.registry import TRANSFORM_DICT +from .classification import classification_mix_collate_fn, classification_onehot_collate_fn from .detection import detection_collate_fn from .registry import CREATE_TRANSFORM, CUSTOM_DATASET, DATA_SAMPLER, HUGGINGFACE_DATASET from .utils.loader import create_loader @@ -19,11 +22,10 @@ def build_dataset(conf_data, conf_augmentation, task: str, model_name: str): task = conf_data.task - assert task in CREATE_TRANSFORM, f"The given task `{task}` is not supported!" assert task in DATA_SAMPLER, f"Data sampler for {task} is not yet supported!" 
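# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the builder
# changes below read `conf.augmentation.mix_transforms`, build the batch-level
# mix transforms (cutmix/mixup) by name, and hand them to the classification
# collate function through functools.partial, so mixing runs once per batch
# inside the DataLoader. A condensed version of that wiring; the config entry
# and num_classes are made up, the other names mirror those added in this patch:
#
#     from functools import partial
#
#     entry = {'name': 'cutmix', 'alpha': 1.0}                 # hypothetical mix_transforms entry
#     kwargs = {k: v for k, v in entry.items() if k != 'name'}
#     kwargs['num_classes'] = 10                               # the patch uses train_dataset.num_classes
#     mix = TRANSFORM_DICT[entry['name']](**kwargs)            # -> RandomCutmix(num_classes=10, alpha=1.0)
#     collate_fn = partial(classification_mix_collate_fn, mix_transforms=[mix])
#     # DataLoader(..., collate_fn=collate_fn) then yields mixed images and soft targets
# ---------------------------------------------------------------------------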
- train_transform = CREATE_TRANSFORM[task](model_name, is_training=True) - target_transform = CREATE_TRANSFORM[task](model_name, is_training=False) + train_transform = CREATE_TRANSFORM(model_name, is_training=True) + target_transform = CREATE_TRANSFORM(model_name, is_training=False) data_format = conf_data.format @@ -101,7 +103,25 @@ def build_dataset(conf_data, conf_augmentation, task: str, model_name: str): def build_dataloader(conf, task: str, model_name: str, train_dataset, eval_dataset, profile=False): if task == 'classification': - collate_fn = None + conf_mix_transform = getattr(conf.augmentation, 'mix_transforms', None) + if conf_mix_transform: + mix_transforms = [] + for mix_transform_conf in conf.augmentation.mix_transforms: + name = mix_transform_conf.name.lower() + + mix_kwargs = list(mix_transform_conf.keys()) + mix_kwargs.remove('name') + mix_kwargs = {k:mix_transform_conf[k] for k in mix_kwargs} + mix_kwargs['num_classes'] = train_dataset.num_classes + + transform = TRANSFORM_DICT[name](**mix_kwargs) + mix_transforms.append(transform) + + train_collate_fn = partial(classification_mix_collate_fn, mix_transforms=mix_transforms) + eval_collate_fn = partial(classification_onehot_collate_fn, num_classes=train_dataset.num_classes) + else: + train_collate_fn = None + eval_collate_fn = None train_loader = create_loader( train_dataset, @@ -112,7 +132,7 @@ def build_dataloader(conf, task: str, model_name: str, train_dataset, eval_datas is_training=True, num_workers=conf.environment.num_workers if not profile else 1, distributed=conf.distributed, - collate_fn=collate_fn, + collate_fn=train_collate_fn, pin_memory=False, world_size=conf.world_size, rank=conf.rank, @@ -128,7 +148,7 @@ def build_dataloader(conf, task: str, model_name: str, train_dataset, eval_datas is_training=False, num_workers=conf.environment.num_workers if not profile else 1, distributed=conf.distributed, - collate_fn=None, + collate_fn=eval_collate_fn, pin_memory=False, world_size=conf.world_size, rank=conf.rank, diff --git a/src/netspresso_trainer/dataloaders/classification/__init__.py b/src/netspresso_trainer/dataloaders/classification/__init__.py index b297f1a0..8618218e 100644 --- a/src/netspresso_trainer/dataloaders/classification/__init__.py +++ b/src/netspresso_trainer/dataloaders/classification/__init__.py @@ -1,4 +1,3 @@ -from .dataset import ClassficationDataSampler +from .dataset import ClassficationDataSampler, classification_mix_collate_fn, classification_onehot_collate_fn from .huggingface import ClassificationHFDataset from .local import ClassificationCustomDataset -from .transforms import create_transform_classification diff --git a/src/netspresso_trainer/dataloaders/classification/dataset.py b/src/netspresso_trainer/dataloaders/classification/dataset.py index cd42daa2..298aeb0d 100644 --- a/src/netspresso_trainer/dataloaders/classification/dataset.py +++ b/src/netspresso_trainer/dataloaders/classification/dataset.py @@ -1,5 +1,6 @@ import csv import logging +import random from collections import Counter from itertools import chain from pathlib import Path @@ -7,6 +8,7 @@ import torch from omegaconf import DictConfig +from torch.nn import functional as F from torch.utils.data import random_split from ..base import BaseDataSampler @@ -16,21 +18,21 @@ logger = logging.getLogger("netspresso_trainer") VALID_IMG_EXTENSIONS = IMG_EXTENSIONS + tuple((x.upper() for x in IMG_EXTENSIONS)) - + def load_class_map_with_id_mapping(root_dir, train_dir, map_or_filename: Optional[Union[str, Path]]=None, id_mapping: 
Optional[Dict[str, str]]=None): if map_or_filename is None: # may be labeled with directory - # dir -> + # dir -> dir_list = [x.name for x in Path(train_dir).iterdir() if x.is_dir()] dir_to_class = id_mapping if id_mapping is not None else {k: k for k in dir_list} # id_mapping or identity - + class_list = [dir_to_class[dir] for dir in dir_list] class_list = sorted(class_list, key=lambda k: natural_key(k)) _class_to_idx = {class_name: class_idx for class_idx, class_name in enumerate(class_list)} idx_to_class = {v: k for k, v in _class_to_idx.items()} - + file_or_dir_to_idx = {dir: _class_to_idx[dir_to_class[dir]] for dir in dir_list} # dir -> idx return file_or_dir_to_idx, idx_to_class @@ -45,9 +47,9 @@ def load_class_map_with_id_mapping(root_dir, train_dir, reader = csv.DictReader(csvfile) file_class_list = [{column: str(row[column]).strip() for column in ['image_id', 'class']} for row in reader] - + class_stats = Counter([x['class'] for x in file_class_list]) - + _class_to_idx = {class_name: class_idx for class_idx, class_name in enumerate(sorted(class_stats, key=lambda k: natural_key(k)))} idx_to_class = {v: k for k, v in _class_to_idx.items()} @@ -62,26 +64,59 @@ def is_file_dict(image_dir: Union[Path, str], file_or_dir_to_idx): file_or_dir: Path = image_dir / candidate_name if file_or_dir.exists(): return file_or_dir.is_file() - + file_candidates = list(image_dir.glob(f"{candidate_name}.*")) assert len(file_candidates) != 0, f"Unknown label format! Is there any something file like {file_or_dir} ?" - + return True + +def classification_mix_collate_fn(original_batch, mix_transforms): + images = [] + target = [] + for data_sample in original_batch: + images.append(data_sample[0]) + target.append(data_sample[1]) + + images = torch.stack(images, dim=0) + target = torch.tensor(target, dtype=torch.long) + + _mix_transform = random.choice(mix_transforms) + images, target = _mix_transform(images, target) + + outputs = (images, target) + return outputs + + +def classification_onehot_collate_fn(original_batch, num_classes): + images = [] + target = [] + for data_sample in original_batch: + images.append(data_sample[0]) + target.append(data_sample[1]) + + images = torch.stack(images, dim=0) + target = torch.tensor(target, dtype=torch.long) + target = F.one_hot(target, num_classes=num_classes).to(dtype=images.dtype) + + outputs = (images, target) + return outputs + + class ClassficationDataSampler(BaseDataSampler): def __init__(self, conf_data, train_valid_split_ratio): super(ClassficationDataSampler, self).__init__(conf_data, train_valid_split_ratio) - + def load_data(self, file_or_dir_to_idx, split='train'): data_root = Path(self.conf_data.path.root) split_dir = self.conf_data.path[split] image_dir: Path = data_root / split_dir.image - + images_and_targets: List[Dict[str, Optional[Union[str, int]]]] = [] - + assert split in ['train', 'valid', 'test'], f"split should be either {['train', 'valid', 'test']}" if split in ['train', 'valid']: - + if is_file_dict(image_dir, file_or_dir_to_idx): file_to_idx = file_or_dir_to_idx for file in chain(image_dir.glob(f'*{ext}') for ext in VALID_IMG_EXTENSIONS): @@ -92,7 +127,7 @@ def load_data(self, file_or_dir_to_idx, split='train'): images_and_targets.append({'image': str(file), 'label': file_to_idx[file.stem]}) continue logger.debug(f"Found file wihtout label: {file}") - + else: dir_to_idx = file_or_dir_to_idx for dir_name, dir_idx in dir_to_idx.items(): @@ -103,24 +138,24 @@ def load_data(self, file_or_dir_to_idx, split='train'): else: # split == test for 
ext in VALID_IMG_EXTENSIONS: images_and_targets.extend([{'image': str(file), 'label': None} for file in image_dir.glob(f'*{ext}')]) - + images_and_targets = sorted(images_and_targets, key=lambda k: natural_key(k['image'])) return images_and_targets - + def load_samples(self): assert self.conf_data.path.train.image is not None root_dir = Path(self.conf_data.path.root) train_dir = root_dir / self.conf_data.path.train.image id_mapping: Optional[dict] = dict(self.conf_data.id_mapping) if self.conf_data.id_mapping is not None else None file_or_dir_to_idx, idx_to_class = load_class_map_with_id_mapping(root_dir, train_dir, map_or_filename=self.conf_data.path.train.label, id_mapping=id_mapping) - + exists_valid = self.conf_data.path.valid.image is not None exists_test = self.conf_data.path.test.image is not None - + valid_samples = None test_samples = None - + train_samples = self.load_data(file_or_dir_to_idx, split='train') if exists_valid: valid_samples = self.load_data(file_or_dir_to_idx, split='valid') @@ -128,16 +163,16 @@ def load_samples(self): test_samples = self.load_data(file_or_dir_to_idx, split='test') if not exists_valid: - num_train_splitted = int(len(train_samples) * self.train_valid_split_ratio) + num_train_splitted = int(len(train_samples) * self.train_valid_split_ratio) train_samples, valid_samples = \ random_split(train_samples, [num_train_splitted, len(train_samples) - num_train_splitted], generator=torch.Generator().manual_seed(42)) - + return train_samples, valid_samples, test_samples, {'idx_to_class': idx_to_class} - + def load_huggingface_samples(self): from datasets import ClassLabel, load_dataset - + cache_dir = self.conf_data.metadata.custom_cache_dir root = self.conf_data.metadata.repo subset_name = self.conf_data.metadata.subset @@ -145,23 +180,23 @@ def load_huggingface_samples(self): cache_dir = Path(cache_dir) Path(cache_dir).mkdir(exist_ok=True, parents=True) total_dataset = load_dataset(root, name=subset_name, cache_dir=cache_dir) - + label_feature_name = self.conf_data.metadata.features.label label_feature = total_dataset['train'].features[label_feature_name] if isinstance(label_feature, ClassLabel): labels: List[str] = label_feature.names else: labels = list({sample[label_feature_name] for sample in total_dataset['train']}) - + if isinstance(labels[0], int): # TODO: find class_map <-> idx and apply it (ex. 
using id_mapping) idx_to_class: Dict[int, int] = {k: k for k in labels} elif isinstance(labels[0], str): idx_to_class: Dict[int, str] = dict(enumerate(labels)) - + exists_valid = 'validation' in total_dataset exists_test = 'test' in total_dataset - + train_samples = total_dataset['train'] valid_samples = None if exists_valid: @@ -174,4 +209,4 @@ def load_huggingface_samples(self): splitted_datasets = train_samples.train_test_split(test_size=(1 - self.train_valid_split_ratio)) train_samples = splitted_datasets['train'] valid_samples = splitted_datasets['test'] - return train_samples, valid_samples, test_samples, {'idx_to_class': idx_to_class} \ No newline at end of file + return train_samples, valid_samples, test_samples, {'idx_to_class': idx_to_class} diff --git a/src/netspresso_trainer/dataloaders/classification/transforms.py b/src/netspresso_trainer/dataloaders/classification/transforms.py deleted file mode 100644 index 9ac5f000..00000000 --- a/src/netspresso_trainer/dataloaders/classification/transforms.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Optional - -from torchvision.transforms.functional import InterpolationMode - -from ..augmentation import custom as TC -from ..utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD - - -def transforms_custom_train(conf_augmentation): - assert conf_augmentation.img_size > 32 - primary_tfl = [TC.RandomResizedCrop(conf_augmentation.img_size, interpolation=InterpolationMode.BILINEAR), - TC.RandomHorizontalFlip(p=conf_augmentation.fliplr) - ] - preprocess = [ - TC.ToTensor(), - TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) - ] - return TC.Compose(primary_tfl + preprocess) - - -def transforms_custom_eval(conf_augmentation): - assert conf_augmentation.img_size > 32 - preprocess = [ - TC.Resize((conf_augmentation.img_size, conf_augmentation.img_size), - interpolation=InterpolationMode.BILINEAR), - TC.ToTensor(), - TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) - ] - return TC.Compose(preprocess) - - -def create_transform_classification(model_name: str, is_training=False): - return transforms_custom_train if is_training else transforms_custom_eval diff --git a/src/netspresso_trainer/dataloaders/detection/__init__.py b/src/netspresso_trainer/dataloaders/detection/__init__.py index 9234262e..38587950 100644 --- a/src/netspresso_trainer/dataloaders/detection/__init__.py +++ b/src/netspresso_trainer/dataloaders/detection/__init__.py @@ -1,3 +1,2 @@ from .dataset import DetectionDataSampler, detection_collate_fn from .local import DetectionCustomDataset -from .transforms import create_transform_detection diff --git a/src/netspresso_trainer/dataloaders/detection/dataset.py b/src/netspresso_trainer/dataloaders/detection/dataset.py index fd6dbcf2..76984375 100644 --- a/src/netspresso_trainer/dataloaders/detection/dataset.py +++ b/src/netspresso_trainer/dataloaders/detection/dataset.py @@ -49,7 +49,7 @@ def detection_collate_fn(original_batch): class DetectionDataSampler(BaseDataSampler): def __init__(self, conf_data, train_valid_split_ratio): super(DetectionDataSampler, self).__init__(conf_data, train_valid_split_ratio) - + def load_data(self, split='train'): data_root = Path(self.conf_data.path.root) split_dir = self.conf_data.path[split] @@ -71,7 +71,7 @@ def load_data(self, split='train'): images = sorted(images, key=lambda k: natural_key(k)) labels = sorted(labels, key=lambda k: natural_key(k)) images_and_targets.extend([{'image': str(image), 'label': str(label)} for image, label in zip(images, 
labels)]) - + elif split == 'test': for ext in IMG_EXTENSIONS: images_and_targets.extend([{'image': str(file), 'label': None} @@ -79,21 +79,21 @@ def load_data(self, split='train'): images_and_targets = sorted(images_and_targets, key=lambda k: natural_key(k['image'])) else: raise AssertionError(f"split should be either {['train', 'valid', 'test']}") - + return images_and_targets - + def load_samples(self): assert self.conf_data.path.train.image is not None assert self.conf_data.id_mapping is not None id_mapping: Optional[list] = list(self.conf_data.id_mapping) idx_to_class = load_custom_class_map(id_mapping=id_mapping) - + exists_valid = self.conf_data.path.valid.image is not None exists_test = self.conf_data.path.test.image is not None - + valid_samples = None test_samples = None - + train_samples = self.load_data(split='train') if exists_valid: valid_samples = self.load_data(split='valid') @@ -101,12 +101,12 @@ def load_samples(self): test_samples = self.load_data(split='test') if not exists_valid: - num_train_splitted = int(len(train_samples) * self.train_valid_split_ratio) + num_train_splitted = int(len(train_samples) * self.train_valid_split_ratio) train_samples, valid_samples = \ random_split(train_samples, [num_train_splitted, len(train_samples) - num_train_splitted], generator=torch.Generator().manual_seed(42)) - + return train_samples, valid_samples, test_samples, {'idx_to_class': idx_to_class} - + def load_huggingface_samples(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/src/netspresso_trainer/dataloaders/detection/local.py b/src/netspresso_trainer/dataloaders/detection/local.py index 2ba29762..3dbbe842 100644 --- a/src/netspresso_trainer/dataloaders/detection/local.py +++ b/src/netspresso_trainer/dataloaders/detection/local.py @@ -23,13 +23,13 @@ def exist_name(candidate, folder_iterable): def get_label(label_file: Path): target = Path(label_file).read_text() - + try: target_array = np.array([list(map(float, box.split(' '))) for box in target.split('\n') if box.strip()]) except ValueError as e: print(target) raise e - + label, boxes = target_array[:, 0], target_array[:, 1:] label = label[..., np.newaxis] return label, boxes @@ -43,7 +43,7 @@ def __init__(self, conf_data, conf_augmentation, model_name, idx_to_class, conf_data, conf_augmentation, model_name, idx_to_class, split, samples, transform, with_label, **kwargs ) - + @staticmethod def xywhn2xyxy(original: np.ndarray, w: int, h: int, padw=0, padh=0): converted = original.copy() @@ -67,12 +67,12 @@ def __getitem__(self, index): if ann_path is None: out = self.transform(self.conf_augmentation)(image=img) return {'pixel_values': out['image'], 'name': img_path.name, 'org_img': org_img, 'org_shape': (h, w)} - + outputs = {} label, boxes_yolo = get_label(Path(ann_path)) boxes = self.xywhn2xyxy(boxes_yolo, w, h) - + out = self.transform(self.conf_augmentation)(image=img, bbox=np.concatenate((boxes, label), axis=-1)) assert out['bbox'].shape[-1] == 5 # ltrb + class_label outputs.update({'pixel_values': out['image'], 'bbox': out['bbox'][..., :4], @@ -83,6 +83,6 @@ def __getitem__(self, index): return outputs assert self._split in ['val', 'valid', 'test'] - # outputs.update({'org_img': org_img, 'org_shape': (h, w)}) # TODO: return org_img with batch_size > 1 + # outputs.update({'org_img': org_img, 'org_shape': (h, w)}) # TODO: return org_img with batch_size > 1 outputs.update({'org_shape': (h, w)}) - return outputs \ No newline at end of file + return outputs diff --git 
a/src/netspresso_trainer/dataloaders/detection/transforms.py b/src/netspresso_trainer/dataloaders/detection/transforms.py deleted file mode 100644 index ac3090a5..00000000 --- a/src/netspresso_trainer/dataloaders/detection/transforms.py +++ /dev/null @@ -1,40 +0,0 @@ -from typing import Optional - -import cv2 -import numpy as np -import PIL.Image as Image - -from ..augmentation import custom as TC -from ..utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD - - -def train_transforms_efficientformer(conf_augmentation): - - crop_size_h = conf_augmentation.crop_size_h - crop_size_w = conf_augmentation.crop_size_w - - train_transforms_composed = TC.Compose([ - TC.Resize(size=(crop_size_h, crop_size_w)), - TC.ToTensor(), - TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) - ]) - - return train_transforms_composed - -def val_transforms_efficientformer(conf_augmentation): - - crop_size_h = conf_augmentation.crop_size_h - crop_size_w = conf_augmentation.crop_size_w - - val_transforms_composed = TC.Compose([ - TC.Resize(size=(crop_size_h, crop_size_w)), - TC.ToTensor(), - TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) - ]) - - return val_transforms_composed - -def create_transform_detection(model_name: str, is_training=False): - if is_training: - return train_transforms_efficientformer - return val_transforms_efficientformer diff --git a/src/netspresso_trainer/dataloaders/registry.py b/src/netspresso_trainer/dataloaders/registry.py index bc71e992..7ff3cc42 100644 --- a/src/netspresso_trainer/dataloaders/registry.py +++ b/src/netspresso_trainer/dataloaders/registry.py @@ -1,26 +1,20 @@ from typing import Callable, Dict, Type -from .augmentation import custom as TC +from .augmentation.transforms import create_transform from .base import BaseCustomDataset, BaseDataSampler, BaseHFDataset from .classification import ( ClassficationDataSampler, ClassificationCustomDataset, ClassificationHFDataset, - create_transform_classification, ) -from .detection import DetectionCustomDataset, DetectionDataSampler, create_transform_detection +from .detection import DetectionCustomDataset, DetectionDataSampler from .segmentation import ( SegmentationCustomDataset, SegmentationDataSampler, SegmentationHFDataset, - create_transform_segmentation, ) -CREATE_TRANSFORM: Dict[str, Callable[..., Callable[..., TC.Compose]]] = { - 'classification': create_transform_classification, - 'segmentation': create_transform_segmentation, - 'detection': create_transform_detection -} +CREATE_TRANSFORM = create_transform CUSTOM_DATASET: Dict[str, Type[BaseCustomDataset]] = { 'classification': ClassificationCustomDataset, @@ -37,4 +31,4 @@ 'classification': ClassficationDataSampler, 'segmentation': SegmentationDataSampler, 'detection': DetectionDataSampler -} \ No newline at end of file +} diff --git a/src/netspresso_trainer/dataloaders/segmentation/__init__.py b/src/netspresso_trainer/dataloaders/segmentation/__init__.py index efed9d1d..9d73a030 100644 --- a/src/netspresso_trainer/dataloaders/segmentation/__init__.py +++ b/src/netspresso_trainer/dataloaders/segmentation/__init__.py @@ -1,4 +1,3 @@ from .dataset import SegmentationDataSampler from .huggingface import SegmentationHFDataset from .local import SegmentationCustomDataset -from .transforms import create_transform_segmentation diff --git a/src/netspresso_trainer/dataloaders/segmentation/huggingface.py b/src/netspresso_trainer/dataloaders/segmentation/huggingface.py index 8c494307..eeb6afc6 100644 --- 
a/src/netspresso_trainer/dataloaders/segmentation/huggingface.py +++ b/src/netspresso_trainer/dataloaders/segmentation/huggingface.py @@ -3,8 +3,8 @@ import numpy as np import PIL.Image as Image +from ..augmentation.transforms import generate_edge, reduce_label from ..base import BaseHFDataset -from ..segmentation.transforms import generate_edge, reduce_label class SegmentationHFDataset(BaseHFDataset): diff --git a/src/netspresso_trainer/dataloaders/segmentation/local.py b/src/netspresso_trainer/dataloaders/segmentation/local.py index 1ab8d305..b39bd8b1 100644 --- a/src/netspresso_trainer/dataloaders/segmentation/local.py +++ b/src/netspresso_trainer/dataloaders/segmentation/local.py @@ -5,8 +5,8 @@ import numpy as np import PIL.Image as Image +from ..augmentation.transforms import generate_edge, reduce_label from ..base import BaseCustomDataset -from ..segmentation.transforms import generate_edge, reduce_label class SegmentationCustomDataset(BaseCustomDataset): @@ -51,7 +51,7 @@ def __getitem__(self, index): mask = Image.fromarray(mask, mode='L') # single mode array (PIL.Image) compatbile with torchvision transform API - if self.model_name == 'pidnet': + if 'pidnet' in self.model_name: edge = generate_edge(np.array(mask)) out = self.transform(self.conf_augmentation)(image=img, mask=mask, edge=edge) outputs.update({'pixel_values': out['image'], 'labels': out['mask'], 'edges': out['edge'].float(), 'name': img_path.name}) diff --git a/src/netspresso_trainer/dataloaders/segmentation/transforms.py b/src/netspresso_trainer/dataloaders/segmentation/transforms.py deleted file mode 100644 index d4aa506c..00000000 --- a/src/netspresso_trainer/dataloaders/segmentation/transforms.py +++ /dev/null @@ -1,119 +0,0 @@ -from typing import Optional - -import cv2 -import numpy as np -import PIL.Image as Image - -from ..augmentation import custom as TC -from ..utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD - -EDGE_SIZE = 4 -Y_K_SIZE = 6 -X_K_SIZE = 6 - - -def reduce_label(label: np.ndarray) -> Image.Image: - label[label == 0] = 255 - label = label - 1 - label[label == 254] = 255 - return Image.fromarray(label) - -def generate_edge(label: np.ndarray) -> Image.Image: - edge = cv2.Canny(label, 0.1, 0.2) - kernel = np.ones((EDGE_SIZE, EDGE_SIZE), np.uint8) - # edge_pad == True - edge = edge[Y_K_SIZE:-Y_K_SIZE, X_K_SIZE:-X_K_SIZE] - edge = np.pad(edge, ((Y_K_SIZE, Y_K_SIZE), (X_K_SIZE, X_K_SIZE)), mode='constant') - edge = (cv2.dilate(edge, kernel, iterations=1) > 50) * 1.0 - return Image.fromarray((edge.copy() * 255).astype(np.uint8)) - - -def train_transforms_segmentation(conf_augmentation): - - crop_size_h = conf_augmentation.crop_size_h - crop_size_w = conf_augmentation.crop_size_w - - scale_ratio = (conf_augmentation.resize_ratio0, conf_augmentation.resize_ratiof) - - train_transforms_composed = TC.Compose([ - TC.RandomResizedCrop((crop_size_h, crop_size_w), scale=scale_ratio, ratio=(1.0, 1.0)), - TC.RandomHorizontalFlip(p=conf_augmentation.fliplr), - TC.ColorJitter(brightness=conf_augmentation.color_jitter.brightness, - contrast=conf_augmentation.color_jitter.contrast, - saturation=conf_augmentation.color_jitter.saturation, - hue=conf_augmentation.color_jitter.hue, - p=conf_augmentation.color_jitter.colorjitter_p), - TC.ToTensor(), - TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) - ]) - - return train_transforms_composed - -def val_transforms_segmentation(conf_augmentation): - - crop_size_h = conf_augmentation.crop_size_h - crop_size_w = conf_augmentation.crop_size_w 
- - val_transforms_composed = TC.Compose([ - TC.Resize((crop_size_h, crop_size_w)), - TC.ToTensor(), - TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) - ]) - - return val_transforms_composed - - -def infer_transforms_segmentation(conf_augmentation): - return - - -def train_transforms_pidnet(conf_augmentation): - - crop_size_h = conf_augmentation.crop_size_h - crop_size_w = conf_augmentation.crop_size_w - - scale_ratio = (conf_augmentation.resize_ratio0, conf_augmentation.resize_ratiof) - - train_transforms_composed = TC.Compose( - [ - TC.RandomResizedCrop((crop_size_h, crop_size_w), scale=scale_ratio, ratio=(1.0, 1.0)), - TC.RandomHorizontalFlip(p=conf_augmentation.fliplr), - TC.ToTensor(), - TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) - ], - additional_targets={'edge': 'mask'} - ) - - return train_transforms_composed - - -def val_transforms_pidnet(conf_augmentation): - - crop_size_h = conf_augmentation.crop_size_h - crop_size_w = conf_augmentation.crop_size_w - - val_transforms_composed = TC.Compose( - [ - TC.Resize((crop_size_h, crop_size_w)), - TC.ToTensor(), - TC.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD) - ], - additional_targets={'edge': 'mask'} - ) - - return val_transforms_composed - - -def infer_transforms_pidnet(conf_augmentation): - return - - -def create_transform_segmentation(model_name: str, is_training=False): - - if model_name == 'pidnet': - if is_training: - return train_transforms_pidnet - return val_transforms_pidnet - if is_training: - return train_transforms_segmentation - return val_transforms_segmentation diff --git a/src/netspresso_trainer/dataloaders/utils/constants.py b/src/netspresso_trainer/dataloaders/utils/constants.py index b017eb44..91d73f29 100644 --- a/src/netspresso_trainer/dataloaders/utils/constants.py +++ b/src/netspresso_trainer/dataloaders/utils/constants.py @@ -1,4 +1,4 @@ -DEFAULT_CROP_PCT = 0.95 #0.875 +DEFAULT_CROP_PCT = 0.95 #0.875 IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) diff --git a/src/netspresso_trainer/dataloaders/utils/misc.py b/src/netspresso_trainer/dataloaders/utils/misc.py index 0fab867f..3fb73621 100644 --- a/src/netspresso_trainer/dataloaders/utils/misc.py +++ b/src/netspresso_trainer/dataloaders/utils/misc.py @@ -19,4 +19,4 @@ def expand_to_chs(x, n): def natural_key(string_): """See http://www.codinghorror.com/blog/archives/001018.html""" - return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] \ No newline at end of file + return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] diff --git a/src/netspresso_trainer/loggers/base.py b/src/netspresso_trainer/loggers/base.py index dffa31d6..6dd321da 100644 --- a/src/netspresso_trainer/loggers/base.py +++ b/src/netspresso_trainer/loggers/base.py @@ -13,41 +13,41 @@ def __init__(self, model, result_dir): self.model = model self.csv_path = Path(result_dir) / CSV_FILENAME self.header: List = [] - + self._temp_row_dict = {} - + if self.csv_path.exists(): self.csv_path.unlink() - + self._epoch = None - + @property @abstractmethod def key_map(self) -> Dict[str, str]: raise NotImplementedError - + def init_epoch(self): self._epoch = 0 - + @property def epoch(self): return self._epoch - + @epoch.setter def epoch(self, value: int) -> None: self._epoch = int(value) - + def update_header(self, header: List): assert len(header) != 0 self.header = header - + with open(self.csv_path, 'a') as f: 
f.write(",".join(self.header)) f.write("\n") def _clear_temp(self): self._temp_row_dict = {} - + def _update_with_list(self, data: List): if data is not None and len(data) != 0: with open(self.csv_path, 'a') as f: @@ -55,18 +55,18 @@ def _update_with_list(self, data: List): f.write("\n") self._clear_temp() return - + def _update_specific(self, data: Dict): for _key, _value in data.items(): if _key not in self.header: raise AssertionError(f"The given key ({_key}) is not in {self.header}!") if _key not in self._temp_row_dict: self._temp_row_dict[_key] = _value - + if set(self.header) == set(self._temp_row_dict.keys()): self._update_with_list([self._temp_row_dict[_col] for _col in self.header]) return - + def update(self, data=None, **kwargs): if isinstance(data, List): return self._update_with_list(data) @@ -74,9 +74,9 @@ def update(self, data=None, **kwargs): return self._update_specific(data) # if isinstance(data, type(None)): # return self._update_specific(kwargs) - + raise AssertionError(f"Type of data should be either List or Dict! Current: {type(data)}") - + def _convert_as_csv_record(self, scalar_dict: Dict, prefix: Literal['train', 'valid'] = 'train'): converted_dict = {} for k, v in scalar_dict.items(): @@ -84,25 +84,25 @@ def _convert_as_csv_record(self, scalar_dict: Dict, prefix: Literal['train', 'va continue record_key = self.key_map[f"{prefix}/{k}"] assert record_key in self.header, f"{record_key} not in {self.header}" - + converted_dict.update({record_key: v}) return converted_dict - + def __call__(self, train_losses, train_metrics, valid_losses=None, valid_metrics=None): assert len(self.header) != 0 assert len(self.key_map) != 0 - + csv_record_dict = {'epoch': self._epoch} converted_train_losses = self._convert_as_csv_record(train_losses, prefix='train') converted_train_metrics = self._convert_as_csv_record(train_metrics, prefix='train') csv_record_dict.update(converted_train_losses) csv_record_dict.update(converted_train_metrics) - + if valid_losses is not None: converted_valid_losses = self._convert_as_csv_record(valid_losses, prefix='valid') csv_record_dict.update(converted_valid_losses) if valid_metrics is not None: converted_valid_metrics = self._convert_as_csv_record(valid_metrics, prefix='valid') csv_record_dict.update(converted_valid_metrics) - + self.update(csv_record_dict) diff --git a/src/netspresso_trainer/loggers/builder.py b/src/netspresso_trainer/loggers/builder.py index 912044bb..63f60253 100644 --- a/src/netspresso_trainer/loggers/builder.py +++ b/src/netspresso_trainer/loggers/builder.py @@ -62,16 +62,16 @@ def __init__( step_per_epoch=step_per_epoch, num_sample_images=num_sample_images) if self.use_tensorboard else None self.stdout_logger: Optional[StdOutLogger] = \ StdOutLogger(task=task, model=model, total_epochs=conf.training.epochs) if self.use_stdout else None - + self.netspresso_api_client = None if self.use_netspresso: from loggers.netspresso import ModelSearchServerHandler self.netspresso_api_client: Optional[ModelSearchServerHandler] = ModelSearchServerHandler(task=task, model=model) - + if task in VISUALIZER: pallete = conf.data.pallete if 'pallete' in conf.data else None self.label_converter = VISUALIZER[task](class_map=class_map, pallete=pallete) - + @property def result_dir(self): return self._result_dir @@ -117,7 +117,7 @@ def _convert_imagedict_as_readable(self, images_dict: Dict): for k, v in images_dict.items(): if k == 'images': continue - + # target, pred, bg_gt v = v[:self.num_sample_images] v_new: np.ndarray = magic_image_handler( diff 
--git a/src/netspresso_trainer/loggers/csv.py b/src/netspresso_trainer/loggers/csv.py index 943acb29..d82f629a 100644 --- a/src/netspresso_trainer/loggers/csv.py +++ b/src/netspresso_trainer/loggers/csv.py @@ -6,7 +6,7 @@ class ClassificationCSVLogger(BaseCSVLogger): def __init__(self, model, result_dir): super(ClassificationCSVLogger, self).__init__(model, result_dir) self.update_header(self.csv_header) - + self._key_map = { 'epoch': 'epoch', 'train/total': 'train_loss', @@ -14,7 +14,7 @@ def __init__(self, model, result_dir): 'train/Acc@1': 'train_accuracy', 'valid/Acc@1': 'valid_accuracy', } - + @property def key_map(self): return self._key_map @@ -24,7 +24,7 @@ class SegmentationCSVLogger(BaseCSVLogger): def __init__(self, model, result_dir): super(SegmentationCSVLogger, self).__init__(model, result_dir) self.update_header(self.csv_header) - + self._key_map = { 'epoch': 'epoch', 'train/total': 'train_loss', @@ -35,4 +35,4 @@ def __init__(self, model, result_dir): @property def key_map(self): - return self._key_map \ No newline at end of file + return self._key_map diff --git a/src/netspresso_trainer/loggers/image.py b/src/netspresso_trainer/loggers/image.py index 68f6c23e..cf9bb0e1 100644 --- a/src/netspresso_trainer/loggers/image.py +++ b/src/netspresso_trainer/loggers/image.py @@ -12,32 +12,32 @@ def __init__(self, model, result_dir) -> None: self.save_dir: Path = Path(result_dir) / "result_image" self.save_dir.mkdir(exist_ok=True) self._epoch = None - + def init_epoch(self): self._epoch = 0 - + @property def epoch(self): return self._epoch - + @epoch.setter def epoch(self, value: int) -> None: self._epoch = int(value) - + def save_ndarray_as_image(self, image_array: np.ndarray, filename: Union[str, Path], dataformats: Literal['HWC', 'CHW'] = 'HWC'): assert image_array.ndim == 3 if dataformats != 'HWC' and dataformats == 'CHW': image_array = image_array.transpose((1, 2, 0)) - + # HWC assert image_array.shape[-1] in [1, 3] Image.fromarray(image_array.astype(np.uint8)).save(filename) return True - + def save_result(self, image_dict: Dict, prefix='train'): prefix_dir: Path = self.save_dir / prefix prefix_dir.mkdir(exist_ok=True) - + for k, v in image_dict.items(): assert isinstance(v, np.ndarray) assert v.ndim in [3, 4], \ @@ -53,5 +53,5 @@ def __call__(self, train_images=None, valid_images=None): self.save_result(train_images, prefix='train') if valid_images is not None: self.save_result(valid_images, prefix='valid') - + diff --git a/src/netspresso_trainer/loggers/netspresso.py b/src/netspresso_trainer/loggers/netspresso.py index 8402c6f1..589a7503 100644 --- a/src/netspresso_trainer/loggers/netspresso.py +++ b/src/netspresso_trainer/loggers/netspresso.py @@ -7,7 +7,7 @@ logger = logging.getLogger("netspresso_trainer") -MONGODB_TEMP_URI = "" +MONGODB_TEMP_URI = "" class ModelSearchServerHandler: @@ -19,27 +19,27 @@ def __init__(self, task, model, mongodb_uri: str=MONGODB_TEMP_URI) -> None: logger.debug("Pinged your deployment. 
You successfully connected to MongoDB!") except Exception as e: raise e - + self._db = client['custom-training-board']['trainer-all-in-one'] self._session_id = None - + self._create_session(title=f"[{task}]{model}") - - + + def init_epoch(self): self._epoch = 0 - + @property def epoch(self): return self._epoch - + @epoch.setter def epoch(self, value: int) -> None: self._epoch = int(value) - + def _is_ready(self): return self._session_id is not None - + def _append(self, scalar_dict, mode='train'): assert self._is_ready() meta_string = f"{mode}/" if mode is not None else "" @@ -48,38 +48,38 @@ def _append(self, scalar_dict, mode='train'): '$currentDate': {'lastModified': True }} result = self._db.update_one({'_id': self._session_id}, contents, upsert=True) return result - + def _create_session(self, title: str ="test") -> ObjectId: example_document = { "title": title } document = self._db.insert_one(example_document) self._session_id = document.inserted_id return self._session_id - + def create_session(self, title: str="test") -> ObjectId: return self._create_session(title=title) - + def log_scalar(self, key, value, mode='train'): result = self._append({key: value}, mode=mode) return result - + def log_scalars_with_dict(self, scalar_dict, mode='train'): result = self._append(scalar_dict, mode=mode) return result - + def __call__(self, train_losses, train_metrics, valid_losses, valid_metrics, learning_rate, elapsed_time, ) -> None: - + self.log_scalars_with_dict(train_losses, mode='train') self.log_scalars_with_dict(train_metrics, mode='train') - + if valid_losses is not None: self.log_scalars_with_dict(valid_losses, mode='valid') if valid_metrics is not None: self.log_scalars_with_dict(valid_metrics, mode='valid') - + if learning_rate is not None: self.log_scalar('learning_rate', learning_rate, mode='misc') if elapsed_time is not None: - self.log_scalar('elapsed_time', elapsed_time, mode='misc') \ No newline at end of file + self.log_scalar('elapsed_time', elapsed_time, mode='misc') diff --git a/src/netspresso_trainer/loggers/registry.py b/src/netspresso_trainer/loggers/registry.py index 1ba8aad6..b8c48590 100644 --- a/src/netspresso_trainer/loggers/registry.py +++ b/src/netspresso_trainer/loggers/registry.py @@ -9,4 +9,4 @@ VISUALIZER = { 'segmentation': SegmentationVisualizer, 'detection': DetectionVisualizer, -} \ No newline at end of file +} diff --git a/src/netspresso_trainer/loggers/stdout.py b/src/netspresso_trainer/loggers/stdout.py index aa0e99ff..7e3d653d 100644 --- a/src/netspresso_trainer/loggers/stdout.py +++ b/src/netspresso_trainer/loggers/stdout.py @@ -11,21 +11,21 @@ def __init__(self, task, model, total_epochs=None) -> None: self.task = task self.model_name = model self.total_epochs = total_epochs if total_epochs is not None else "???" 
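# [Editor's aside, not part of the patch] A compact sketch of how the CSV loggers above turn the
# per-epoch scalar dicts into one CSV row: keys are prefixed with 'train'/'valid', looked up in
# key_map (the ClassificationCSVLogger mapping is shown), and unmapped keys are skipped.
# The epoch number and metric values here are made up.
key_map = {
    'epoch': 'epoch',
    'train/total': 'train_loss',
    'train/Acc@1': 'train_accuracy',
    'valid/Acc@1': 'valid_accuracy',
}
train_losses = {'total': 0.42}
train_metrics = {'Acc@1': 87.5}

record = {'epoch': 3}
for prefix, scalars in (('train', train_losses), ('train', train_metrics)):
    for k, v in scalars.items():
        mapped_key = key_map.get(f"{prefix}/{k}")
        if mapped_key is not None:   # keys without a mapping are simply skipped
            record[mapped_key] = v
# record == {'epoch': 3, 'train_loss': 0.42, 'train_accuracy': 87.5} -> written as one CSV line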
- + def init_epoch(self): self._epoch = 0 - + @property def epoch(self): return self._epoch - + @epoch.setter def epoch(self, value: int) -> None: self._epoch = int(value) - + def __call__(self, train_losses, train_metrics, valid_losses, valid_metrics, learning_rate, elapsed_time): logger.info(f"Epoch: {self._epoch} / {self.total_epochs}") - + if learning_rate is not None: logger.info(f"learning rate: {learning_rate:.7f}") if elapsed_time is not None: diff --git a/src/netspresso_trainer/loggers/tensorboard.py b/src/netspresso_trainer/loggers/tensorboard.py index 3ebc9c60..e905b0dd 100644 --- a/src/netspresso_trainer/loggers/tensorboard.py +++ b/src/netspresso_trainer/loggers/tensorboard.py @@ -70,14 +70,14 @@ def log_image(self, key, value: Union[np.ndarray, torch.Tensor], mode='train'): def log_images_with_dict(self, image_dict, mode='train'): for k, v in image_dict.items(): self._log_image(k, v, mode) - + def _get_rasterized_hparam(self, hparams): if not isinstance(hparams, dict): stem = hparams if not isinstance(hparams, (int, float, str, bool, torch.Tensor)): return str(stem) return stem - + rasterized_dict = {} for key, value in hparams.items(): if isinstance(value, dict): @@ -90,15 +90,15 @@ def _get_rasterized_hparam(self, hparams): return rasterized_dict def log_hparams(self, hp_omegaconf: Union[Dict, List], final_metrics=None): - + if final_metrics is None: final_metrics = {} final_metrics = {f"hparams_metrics/{k}": v for k, v in final_metrics.items()} - + hp_dict = OmegaConf.to_container(hp_omegaconf, resolve=True) hp_for_log = self._get_rasterized_hparam(hp_dict) - - exp, ssi, sei = hparams(hparam_dict=hp_for_log, metric_dict=final_metrics) + + exp, ssi, sei = hparams(hparam_dict=hp_for_log, metric_dict=final_metrics) self.tensorboard.file_writer.add_summary(exp) self.tensorboard.file_writer.add_summary(ssi) self.tensorboard.file_writer.add_summary(sei) diff --git a/src/netspresso_trainer/loggers/visualizer.py b/src/netspresso_trainer/loggers/visualizer.py index 3d11934b..349dcd68 100644 --- a/src/netspresso_trainer/loggers/visualizer.py +++ b/src/netspresso_trainer/loggers/visualizer.py @@ -55,7 +55,7 @@ def _convert(self, gray_image): return color_image def __call__(self, results: List[Tuple[np.ndarray, np.ndarray]], images=None): - + return_images = [] for image, result in zip(images, results): image = image.copy() @@ -75,12 +75,12 @@ def __call__(self, results: List[Tuple[np.ndarray, np.ndarray]], images=None): text_w, text_h = text_size image = cv2.rectangle(image, (x1, y1-5-text_h), (x1+text_w, y1), color=color, thickness=-1) image = cv2.putText(image, str(class_name), (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) - + return_images.append(image[np.newaxis, ...]) return_images = np.concatenate(return_images, axis=0) return return_images - - + + class SegmentationVisualizer: def __init__(self, class_map, pallete=None): n = len(class_map) diff --git a/src/netspresso_trainer/losses/classification/__init__.py b/src/netspresso_trainer/losses/classification/__init__.py index f072e4cc..e69de29b 100644 --- a/src/netspresso_trainer/losses/classification/__init__.py +++ b/src/netspresso_trainer/losses/classification/__init__.py @@ -1,2 +0,0 @@ -from .label_smooth import LabelSmoothingCrossEntropy -from .soft_target import SoftTargetCrossEntropy \ No newline at end of file diff --git a/src/netspresso_trainer/losses/classification/label_smooth.py b/src/netspresso_trainer/losses/classification/label_smooth.py deleted file mode 100644 index 61e2e377..00000000 --- 
a/src/netspresso_trainer/losses/classification/label_smooth.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class LabelSmoothingCrossEntropy(nn.Module): - """ NLL loss with label smoothing. - """ - def __init__(self, smoothing=0.1): - super(LabelSmoothingCrossEntropy, self).__init__() - assert smoothing < 1.0 - self.smoothing = smoothing - self.confidence = 1. - smoothing - - def forward(self, out: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - pred = out['pred'] - logprobs = F.log_softmax(pred, dim=-1) - nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)) - nll_loss = nll_loss.squeeze(1) - smooth_loss = -logprobs.mean(dim=-1) - loss = self.confidence * nll_loss + self.smoothing * smooth_loss - return loss.mean() \ No newline at end of file diff --git a/src/netspresso_trainer/losses/classification/soft_target.py b/src/netspresso_trainer/losses/classification/soft_target.py deleted file mode 100644 index 2dfc8cd0..00000000 --- a/src/netspresso_trainer/losses/classification/soft_target.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Dict - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class SoftTargetCrossEntropy(nn.Module): # cutmix/mixup augmentation - def __init__(self): - super(SoftTargetCrossEntropy, self).__init__() - - def forward(self, out: Dict, target: torch.Tensor) -> torch.Tensor: - pred = out['pred'] - loss = torch.sum(-target * F.log_softmax(pred, dim=-1), dim=-1) - return loss.mean() \ No newline at end of file diff --git a/src/netspresso_trainer/losses/common.py b/src/netspresso_trainer/losses/common.py index eda3b8ba..fa709c00 100644 --- a/src/netspresso_trainer/losses/common.py +++ b/src/netspresso_trainer/losses/common.py @@ -1,16 +1,19 @@ -from typing import Dict +from typing import Dict, Optional import torch import torch.nn as nn import torch.nn.functional as F +from torch import Tensor class CrossEntropyLoss(nn.Module): - def __init__(self, ignore_index, **kwargs) -> None: + def __init__(self, weight: Optional[Tensor]=None, size_average=None, ignore_index: int=-100, + reduce=None, label_smoothing: float=0.0): super(CrossEntropyLoss, self).__init__() - self.loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index, **kwargs) + self.loss_fn = nn.CrossEntropyLoss(weight=weight, size_average=size_average, ignore_index=ignore_index, + reduce=reduce, reduction='mean', label_smoothing=label_smoothing) def forward(self, out: Dict, target: torch.Tensor) -> torch.Tensor: pred = out['pred'] loss = self.loss_fn(pred, target) - return loss \ No newline at end of file + return loss diff --git a/src/netspresso_trainer/losses/detection/__init__.py b/src/netspresso_trainer/losses/detection/__init__.py index 24d6942e..2d5f5d05 100644 --- a/src/netspresso_trainer/losses/detection/__init__.py +++ b/src/netspresso_trainer/losses/detection/__init__.py @@ -1,2 +1,2 @@ from .fastrcnn import RoiHeadLoss, RPNLoss -from .yolox import YOLOXLoss \ No newline at end of file +from .yolox import YOLOXLoss diff --git a/src/netspresso_trainer/losses/detection/fastrcnn.py b/src/netspresso_trainer/losses/detection/fastrcnn.py index 976a6098..28b2ab18 100644 --- a/src/netspresso_trainer/losses/detection/fastrcnn.py +++ b/src/netspresso_trainer/losses/detection/fastrcnn.py @@ -12,7 +12,7 @@ class RoiHeadLoss(nn.Module): def __init__(self) -> None: super().__init__() - + @staticmethod def forward(out: torch.Tensor, target: torch.Tensor) -> torch.Tensor: class_logits, box_regression, labels, 
regression_targets =\ @@ -43,10 +43,10 @@ def forward(out: torch.Tensor, target: torch.Tensor) -> torch.Tensor: "loss_classifier": classification_loss, "loss_box_reg": box_loss } - + # TODO: return as dict return sum(losses.values()) - + class RPNLoss(nn.Module): def __init__(self, box_fg_iou_thresh=0.5, @@ -54,7 +54,7 @@ def __init__(self, box_batch_size_per_image=512, box_positive_fraction=0.25) -> None: super().__init__() - + self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) self.box_similarity = box_ops.box_iou self.proposal_matcher = det_utils.Matcher( @@ -63,7 +63,7 @@ def __init__(self, allow_low_quality_matches=True, ) self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(box_batch_size_per_image, box_positive_fraction) - + def _assign_targets_to_anchors(self, anchors: List[Tensor], targets: List[Dict[str, Tensor]] ) -> Tuple[List[Tensor], List[Tensor]]: @@ -100,7 +100,7 @@ def _assign_targets_to_anchors(self, anchors: List[Tensor], targets: List[Dict[s labels.append(labels_per_image) matched_gt_boxes.append(matched_gt_boxes_per_image) return labels, matched_gt_boxes - + def _compute_loss(self, objectness: Tensor, pred_bbox_deltas: Tensor, labels: List[Tensor], regression_targets: List[Tensor] ) -> Tuple[Tensor, Tensor]: """ @@ -137,7 +137,7 @@ def _compute_loss(self, objectness: Tensor, pred_bbox_deltas: Tensor, labels: Li objectness_loss = F.binary_cross_entropy_with_logits(objectness[sampled_inds], labels[sampled_inds]) return objectness_loss, box_loss - + def forward(self, out: torch.Tensor, target: torch.Tensor) -> torch.Tensor: anchors, objectness, pred_bbox_deltas = out['anchors'], out['objectness'], out['pred_bbox_deltas'] labels, matched_gt_boxes = self._assign_targets_to_anchors(anchors, target) @@ -150,4 +150,4 @@ def forward(self, out: torch.Tensor, target: torch.Tensor) -> torch.Tensor: "loss_rpn_box_reg": loss_rpn_box_reg, } # TODO: return as dict - return sum(losses.values()) \ No newline at end of file + return sum(losses.values()) diff --git a/src/netspresso_trainer/losses/detection/yolox.py b/src/netspresso_trainer/losses/detection/yolox.py index f593cc0a..76263d6f 100644 --- a/src/netspresso_trainer/losses/detection/yolox.py +++ b/src/netspresso_trainer/losses/detection/yolox.py @@ -47,9 +47,10 @@ def __init__(self, **kwargs) -> None: super(YOLOXLoss, self).__init__() self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none") self.iou_loss = IOUloss(reduction="none") - + def forward(self, out: List, target: Dict) -> torch.Tensor: + out = out['pred'] x_shifts = [] y_shifts = [] expanded_strides = [] @@ -90,10 +91,10 @@ def forward(self, out: List, target: Dict) -> torch.Tensor: [], dtype=out[0].dtype, ) - + # TODO: return as dict return total_loss - + def get_losses( self, imgs, @@ -263,7 +264,7 @@ def get_losses( #loss_l1, num_fg / max(num_gts, 1), ) - + @torch.no_grad() def get_assignments( self, @@ -354,7 +355,7 @@ def get_assignments( matched_gt_inds, num_fg, ) - + def get_geometry_constraint( self, gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, ): @@ -385,7 +386,7 @@ def get_geometry_constraint( geometry_relation = is_in_centers[:, anchor_filter] return anchor_filter, geometry_relation - + def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) @@ -419,7 +420,7 @@ def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): fg_mask_inboxes ] return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds - + 
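# [Editor's aside, not part of the patch] The losses in this release share a dict-based call
# convention: forward(out, target) reads the logits from out['pred'] (see CrossEntropyLoss in
# losses/common.py and the `out = out['pred']` unpacking added to YOLOXLoss.forward above).
# A runnable sketch with a simplified constructor; shapes and values are illustrative only.
import torch
import torch.nn as nn

class DictCrossEntropyLoss(nn.Module):
    def __init__(self, ignore_index: int = -100, label_smoothing: float = 0.0):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='mean',
                                           label_smoothing=label_smoothing)

    def forward(self, out, target):
        return self.loss_fn(out['pred'], target)   # predictions always travel under 'pred'

criterion = DictCrossEntropyLoss(label_smoothing=0.1)
out = {'pred': torch.randn(4, 10)}                  # batch of 4 samples, 10 classes
target = torch.randint(0, 10, (4,))
loss = criterion(out, target)                       # scalar tensor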
def get_output_and_grid(self, output, k, stride, dtype): grid = self.grids[k] diff --git a/src/netspresso_trainer/losses/registry.py b/src/netspresso_trainer/losses/registry.py index 61d8f554..7e45b383 100644 --- a/src/netspresso_trainer/losses/registry.py +++ b/src/netspresso_trainer/losses/registry.py @@ -1,12 +1,9 @@ -from .classification import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy from .common import CrossEntropyLoss from .detection import RoiHeadLoss, RPNLoss, YOLOXLoss from .segmentation import BoundaryLoss, PIDNetBoundaryAwareCrossEntropy, PIDNetCrossEntropy LOSS_DICT = { 'cross_entropy': CrossEntropyLoss, - 'soft_target_cross_entropy': SoftTargetCrossEntropy, - 'label_smoothing_cross_entropy': LabelSmoothingCrossEntropy, 'pidnet_cross_entropy': PIDNetCrossEntropy, 'boundary_loss': BoundaryLoss, 'pidnet_cross_entropy_with_boundary': PIDNetBoundaryAwareCrossEntropy, @@ -15,4 +12,4 @@ 'yolox_loss': YOLOXLoss, } -PHASE_LIST = ['train', 'valid', 'test'] \ No newline at end of file +PHASE_LIST = ['train', 'valid', 'test'] diff --git a/src/netspresso_trainer/losses/segmentation/pidnet.py b/src/netspresso_trainer/losses/segmentation/pidnet.py index 2bdc7cab..736ab018 100644 --- a/src/netspresso_trainer/losses/segmentation/pidnet.py +++ b/src/netspresso_trainer/losses/segmentation/pidnet.py @@ -26,7 +26,7 @@ def __init__(self, ignore_index=IGNORE_INDEX_NONE_VALUE, weight=None): self.boundary_aware = False def _forward(self, out: torch.Tensor, target: torch.Tensor): - + return self.loss_fn(out, target) def forward(self, out: Dict, target: torch.Tensor): @@ -36,7 +36,7 @@ def forward(self, out: Dict, target: torch.Tensor): filler = torch.ones_like(target) * self.ignore_index bd_label = torch.where(torch.sigmoid(extra_d[:, 0, :, :]) > 0.8, target, filler) return self._forward(pred, bd_label) - + pred, extra_p = out['pred'], out['extra_p'] score = [extra_p, pred] return sum([w * self._forward(x, target) for (w, x) in zip(BALANCE_WEIGHTS, score)]) @@ -45,7 +45,7 @@ class PIDNetBoundaryAwareCrossEntropy(PIDNetCrossEntropy): def __init__(self, ignore_index=IGNORE_INDEX_NONE_VALUE, weight=None): super().__init__(ignore_index, weight) self.boundary_aware = True - + # class OhemCrossEntropy(nn.Module): # def __init__(self, ignore_label=-1, thres=0.7, min_kept=100000, weight=None): # super(OhemCrossEntropy, self).__init__() diff --git a/src/netspresso_trainer/metrics/__init__.py b/src/netspresso_trainer/metrics/__init__.py index 330ddc64..923da55e 100644 --- a/src/netspresso_trainer/metrics/__init__.py +++ b/src/netspresso_trainer/metrics/__init__.py @@ -1 +1 @@ -from .builder import build_metrics \ No newline at end of file +from .builder import build_metrics diff --git a/src/netspresso_trainer/metrics/classification/metric.py b/src/netspresso_trainer/metrics/classification/metric.py index efa4dc29..a4126313 100644 --- a/src/netspresso_trainer/metrics/classification/metric.py +++ b/src/netspresso_trainer/metrics/classification/metric.py @@ -8,11 +8,10 @@ @torch.no_grad() -def accuracy_topk(output, target): +def accuracy_topk(pred, target): """Computes the accuracy over the k top predictions for the specified values of k""" - maxk = min(TOPK_MAX, output.size()[1]) batch_size = target.size(0) - _, pred = output.topk(maxk, 1, True, True) + maxk = pred.size(-1) pred = pred.t() correct = pred.eq(target.reshape(1, -1).expand_as(pred)) return lambda topk: correct[:min(topk, maxk)].reshape(-1).float().sum(0) * 100. 
/ batch_size @@ -25,17 +24,6 @@ class ClassificationMetric(BaseMetric): def __init__(self, **kwargs): super().__init__() - @torch.no_grad() - @staticmethod - def accuracy_topk(output, target): - """Computes the accuracy over the k top predictions for the specified values of k""" - maxk = min(TOPK_MAX, output.size()[1]) - batch_size = target.size(0) - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.reshape(1, -1).expand_as(pred)) - return lambda topk: correct[:min(topk, maxk)].reshape(-1).float().sum(0) * 100. / batch_size - def calibrate(self, pred, target, **kwargs): result_dict = {k: 0. for k in self.metric_names} topk_callable = accuracy_topk(pred, target) diff --git a/src/netspresso_trainer/metrics/detection/metric.py b/src/netspresso_trainer/metrics/detection/metric.py index 17a83845..93e2070f 100644 --- a/src/netspresso_trainer/metrics/detection/metric.py +++ b/src/netspresso_trainer/metrics/detection/metric.py @@ -167,7 +167,7 @@ def average_precisions_per_class( class DetectionMetric(BaseMetric): metric_names: List[str] = ['map50', 'map75', 'map50_95'] primary_metric: str = 'map50_95' - + def __init__(self, **kwargs): super().__init__() diff --git a/src/netspresso_trainer/metrics/registry.py b/src/netspresso_trainer/metrics/registry.py index 381ab64f..73603e13 100644 --- a/src/netspresso_trainer/metrics/registry.py +++ b/src/netspresso_trainer/metrics/registry.py @@ -11,4 +11,4 @@ 'detection': DetectionMetric } -PHASE_LIST = ['train', 'valid', 'test'] \ No newline at end of file +PHASE_LIST = ['train', 'valid', 'test'] diff --git a/src/netspresso_trainer/metrics/segmentation/metric.py b/src/netspresso_trainer/metrics/segmentation/metric.py index fea6398e..65ca19a8 100644 --- a/src/netspresso_trainer/metrics/segmentation/metric.py +++ b/src/netspresso_trainer/metrics/segmentation/metric.py @@ -47,8 +47,7 @@ def calibrate(self, pred, target, **kwargs): result_dict = {k: AverageMeter(k) for k in self.metric_names} B = pred.size(0) - output_seg = torch.max(pred, dim=1)[1] # argmax - metrics = self.intersection_and_union_gpu(output_seg, target) + metrics = self.intersection_and_union_gpu(pred, target) result_dict['iou'].update(sum(metrics['intersection']) / (sum(metrics['union']) + 1e-10), n=B) result_dict['pixel_acc'].update(sum(metrics['intersection']) / (sum(metrics['target']) + 1e-10), n=B) diff --git a/src/netspresso_trainer/models/backbones/__init__.py b/src/netspresso_trainer/models/backbones/__init__.py index 591a70d1..03737edd 100644 --- a/src/netspresso_trainer/models/backbones/__init__.py +++ b/src/netspresso_trainer/models/backbones/__init__.py @@ -1,8 +1,9 @@ # from .core import * from .experimental.darknet import cspdarknet from .experimental.efficientformer import efficientformer -from .experimental.mobilenetv3 import mobilenetv3_small +from .experimental.mixnet import mixnet +from .experimental.mobilenetv3 import mobilenetv3 from .experimental.mobilevit import mobilevit -from .experimental.resnet import resnet50 +from .experimental.resnet import resnet from .experimental.segformer import segformer from .experimental.vit import vit diff --git a/src/netspresso_trainer/models/backbones/experimental/darknet.py b/src/netspresso_trainer/models/backbones/experimental/darknet.py index 3759c0c9..49b134e8 100644 --- a/src/netspresso_trainer/models/backbones/experimental/darknet.py +++ b/src/netspresso_trainer/models/backbones/experimental/darknet.py @@ -2,7 +2,9 @@ Based on the Darknet implementation of Megvii. 
https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/models/darknet.py """ +from typing import Dict, Optional, List +from omegaconf import DictConfig import torch from torch import nn @@ -14,22 +16,25 @@ class CSPDarknet(nn.Module): + def __init__( self, - task, - dep_mul, - wid_mul, - out_features=("dark3", "dark4", "dark5"), + task: str, + params: Optional[DictConfig] = None, + stage_params: Optional[List] = None, #depthwise=False, - act_type="silu", - **kwargs - ): + ) -> None: super().__init__() + out_features=("dark3", "dark4", "dark5") assert out_features, "please provide output features of Darknet" self.task = task.lower() self.use_intermediate_features = self.task in ['segmentation', 'detection'] + dep_mul = params.dep_mul + wid_mul = params.wid_mul + act_type = params.act_type + self.out_features = out_features Conv = ConvLayer @@ -147,4 +152,4 @@ def task_support(self, task): def cspdarknet(task, conf_model_backbone) -> CSPDarknet: - return CSPDarknet(task, **conf_model_backbone) + return CSPDarknet(task, conf_model_backbone.params, conf_model_backbone.stage_params) diff --git a/src/netspresso_trainer/models/backbones/experimental/efficientformer.py b/src/netspresso_trainer/models/backbones/experimental/efficientformer.py index 6ba4a5e3..23ee2ae7 100644 --- a/src/netspresso_trainer/models/backbones/experimental/efficientformer.py +++ b/src/netspresso_trainer/models/backbones/experimental/efficientformer.py @@ -5,8 +5,9 @@ import itertools import math import os -from typing import Dict, Optional +from typing import Dict, Optional, List +from omegaconf import DictConfig import torch import torch.nn as nn @@ -329,15 +330,33 @@ def forward(self, x): class EfficientFormer(MetaFormer): def __init__( - self, task, num_blocks, hidden_sizes, - num_attention_heads, attention_hidden_size, attention_dropout_prob, - attention_ratio, attention_bias_resolution, - pool_size, intermediate_ratio, hidden_dropout_prob, hidden_activation_type, - layer_norm_eps, - drop_path_rate=0., use_layer_scale=True, layer_scale_init_value=1e-5, - downsamples=None, down_patch_size=3, down_stride=2, down_pad=1, - vit_num=1, **kwargs - ): + self, + task: str, + params: Optional[DictConfig] = None, + stage_params: Optional[List] = None, + ) -> None: + + num_blocks = [stage.num_blocks for stage in stage_params] + hidden_sizes = [stage.hidden_sizes for stage in stage_params] + downsamples = [stage.downsamples for stage in stage_params] + + num_attention_heads = params.num_attention_heads + attention_hidden_size = params.attention_hidden_size + attention_dropout_prob = params.attention_dropout_prob + attention_ratio = params.attention_ratio + attention_bias_resolution = params.attention_bias_resolution + pool_size = params.pool_size + intermediate_ratio = params.intermediate_ratio + hidden_dropout_prob = params.hidden_dropout_prob + hidden_activation_type = params.hidden_activation_type + layer_norm_eps = params.layer_norm_eps + drop_path_rate = params.drop_path_rate + use_layer_scale = params.use_layer_scale + layer_scale_init_value = params.layer_scale_init_value + down_patch_size = params.down_patch_size + down_stride = params.down_stride + down_pad = params.down_pad + vit_num = params.vit_num super().__init__(hidden_sizes) self.task = task.lower() @@ -374,4 +393,4 @@ def forward(self, x): def efficientformer(task, conf_model_backbone) -> EfficientFormer: - return EfficientFormer(task, **conf_model_backbone) + return EfficientFormer(task, conf_model_backbone.params, conf_model_backbone.stage_params) diff 
--git a/src/netspresso_trainer/models/backbones/experimental/mixnet.py b/src/netspresso_trainer/models/backbones/experimental/mixnet.py new file mode 100644 index 00000000..ba4c57b6 --- /dev/null +++ b/src/netspresso_trainer/models/backbones/experimental/mixnet.py @@ -0,0 +1,286 @@ +""" +Based on the publicly available MixNet-PyTorch repository. +https://github.com/romulus0914/MixNet-PyTorch/blob/master/mixnet.py +""" +from collections import OrderedDict +import math +from typing import Dict, List, Optional + +from omegaconf import DictConfig +import torch +from torch import nn +from torch.nn import functional as F +from torchvision.ops.misc import SqueezeExcitation as SEBlock + +from ...op.registry import ACTIVATION_REGISTRY +from ...op.custom import ConvLayer +from ...utils import BackboneOutput + + +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +# GPConv: Grouped Point-wise Convolution for MixDepthBlock +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +class GPConv(nn.Module): + def __init__(self, in_planes, out_planes, kernel_sizes): + super(GPConv, self).__init__() + self.num_groups = len(kernel_sizes) + assert in_planes % self.num_groups == 0 + sub_in_dim = in_planes // self.num_groups + sub_out_dim = out_planes // self.num_groups + + self.group_point_wise = nn.ModuleList() + for _ in kernel_sizes: + self.group_point_wise.append(nn.Conv2d(sub_in_dim, sub_out_dim, + kernel_size=1, stride=1, padding=0, + groups=1, dilation=1, bias=False)) + + def forward(self, x): + if self.num_groups == 1: + return self.group_point_wise[0](x) + + chunks = torch.chunk(x, chunks=self.num_groups, dim=1) + mix = [self.group_point_wise[stream](chunks[stream]) for stream in range(self.num_groups)] + return torch.cat(mix, dim=1) + + +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +# MDConv: Mixed Depth-wise Convolution for MixDepthBlock +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +class MDConv(nn.Module): + def __init__(self, in_planes, kernel_sizes, stride=1, dilate=1): + super(MDConv, self).__init__() + self.num_groups = len(kernel_sizes) + assert in_planes % self.num_groups == 0 + sub_hidden_dim = in_planes // self.num_groups + + assert stride in [1, 2] + dilate = 1 if stride > 1 else dilate + + self.mixed_depth_wise = nn.ModuleList() + for kernel_size in kernel_sizes: + padding = ((kernel_size - 1) // 2) * dilate + self.mixed_depth_wise.append(nn.Conv2d(sub_hidden_dim, sub_hidden_dim, + kernel_size=kernel_size, stride=stride, padding=padding, + groups=sub_hidden_dim, dilation=dilate, bias=False)) + + def forward(self, x): + if self.num_groups == 1: + return self.mixed_depth_wise[0](x) + + chunks = torch.chunk(x, chunks=self.num_groups, dim=1) + mix = [self.mixed_depth_wise[stream](chunks[stream]) for stream in range(self.num_groups)] + return torch.cat(mix, dim=1) + + +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +# MixDepthBlock: MixDepthBlock for MixNet +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +class MixDepthBlock(nn.Module): + def __init__(self, in_planes, out_planes, + expand_ratio, exp_kernel_sizes, kernel_sizes, poi_kernel_sizes, stride, dilate, + reduction_ratio=4, dropout_rate=0.2, act_type="swish"): + super(MixDepthBlock, self).__init__() + self.dropout_rate = dropout_rate + self.expand_ratio = expand_ratio + self.out_channels = out_planes + + self.groups = len(kernel_sizes) + self.use_se = (reduction_ratio is not None) and (reduction_ratio > 1) + self.use_residual 
= in_planes == out_planes and stride == 1 + + assert stride in [1, 2] + dilate = 1 if stride > 1 else dilate + hidden_dim = in_planes * expand_ratio + + # step 1. Expansion phase/Point-wise convolution + if expand_ratio != 1: + self.expansion = nn.Sequential(OrderedDict([ + ("conv", GPConv(in_planes, hidden_dim, kernel_sizes=exp_kernel_sizes)), + ("norm", nn.BatchNorm2d(hidden_dim, eps=1e-3, momentum=0.01)), + ("act", ACTIVATION_REGISTRY[act_type]()) + ])) + + # step 2. Depth-wise convolution phase + self.depth_wise = nn.Sequential(OrderedDict([ + ("conv", MDConv(hidden_dim, kernel_sizes=kernel_sizes, stride=stride, dilate=dilate)), + ("norm", nn.BatchNorm2d(hidden_dim, eps=1e-3, momentum=0.01)), + ("act", ACTIVATION_REGISTRY[act_type]()) + ])) + + # step 3. Squeeze and Excitation + if self.use_se: + reduced_dim = max(1, int(in_planes / reduction_ratio)) + self.se_block = SEBlock(input_channels=hidden_dim, squeeze_channels=reduced_dim, activation=ACTIVATION_REGISTRY[act_type]) + + # step 4. Point-wise convolution phase + self.point_wise = nn.Sequential(OrderedDict([ + ("conv", GPConv(hidden_dim, out_planes, kernel_sizes=poi_kernel_sizes)), + ("norm", nn.BatchNorm2d(out_planes, eps=1e-3, momentum=0.01)) + ])) + + def forward(self, x): + res = x + + # step 1. Expansion phase/Point-wise convolution + if self.expand_ratio != 1: + x = self.expansion(x) + + # step 2. Depth-wise convolution phase + x = self.depth_wise(x) + + # step 3. Squeeze and Excitation + if self.use_se: + x = self.se_block(x) + + # step 4. Point-wise convolution phase + x = self.point_wise(x) + + # step 5. Skip connection and drop connect + if self.use_residual: + if self.training and (self.dropout_rate is not None): + x = F.dropout2d(input=x, p=self.dropout_rate, + training=self.training, ) + x = x + res + + return x + + +class MixNet(nn.Module): + def __init__( + self, + task: str, + params: Optional[DictConfig] = None, + stage_params: Optional[List] = None, + ): + super(MixNet, self).__init__() + self.task = task.lower() + self.use_intermediate_features = self.task in ['segmentation', 'detection'] + + stem_planes = params.stem_planes + width_multi = params.width_multi + depth_multi = params.depth_multi + self.dropout_rate = params.dropout_rate + + out_channels = self._round_filters(stem_planes, width_multi) + self.mod1 = ConvLayer(in_channels=3, out_channels=out_channels, kernel_size=3, + stride=2, groups=1, dilation=1, act_type="relu") + + in_channels = out_channels + drop_rate = self.dropout_rate + stages: List[nn.Module] = [] + for stg_idx, stage_info in enumerate(stage_params): + + stage: List[nn.Module] = [] + for block in zip(stage_info.expand_ratio, stage_info.out_channels, stage_info.num_blocks, + stage_info.kernel_sizes, stage_info.exp_kernel_sizes, stage_info.poi_kernel_sizes, + stage_info.stride, stage_info.dilation, stage_info.act_type, stage_info.se_reduction_ratio): + t, c, n, k, ek, pk, s, d, a, se = block + out_channels = self._round_filters(c, width_multi) + repeats = self._round_repeats(n, depth_multi) + + for block_id in range(repeats): + stride = s if block_id == 0 else 1 + dilate = d if stride == 1 else 1 + + stage.append(MixDepthBlock(in_channels, out_channels, + expand_ratio=t, exp_kernel_sizes=ek, + kernel_sizes=k, poi_kernel_sizes=pk, + stride=stride, dilate=dilate, + reduction_ratio=se, + dropout_rate=drop_rate, + act_type=a)) + + in_channels = out_channels + + # add last conv + if stg_idx == len(stage_params) - 1: + self.last_channels = 1536 + stage.append( + ConvLayer(in_channels=in_channels, + 
out_channels=self.last_channels, + kernel_size=1, + stride=1, + groups=1, + dilation=1, + act_type="relu") + ) + + stage = nn.Sequential(*stage) + stages.append(stage) + + self.stages = nn.ModuleList(stages) + self.avgpool = nn.AdaptiveAvgPool2d(1) + + self._feature_dim = self.last_channels + self._intermediate_features_dim = [s[-1].out_channels for s in self.stages[:-1]] + self._intermediate_features_dim += [self.last_channels] + + self._initialize_weights() + + def _initialize_weights(self): + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + fan_out = m.weight.size(0) + init_range = 1.0 / math.sqrt(fan_out) + nn.init.uniform_(m.weight, -init_range, init_range) + if m.bias is not None: + nn.init.zeros_(m.bias) + + @staticmethod + def _make_divisible(value, divisor=8): + new_value = max(divisor, int(value + divisor / 2) // divisor * divisor) + if new_value < 0.9 * value: + new_value += divisor + return new_value + + def _round_filters(self, filters, width_multi): + if width_multi == 1.0: + return filters + return int(self._make_divisible(filters * width_multi)) + + @staticmethod + def _round_repeats(repeats, depth_multi): + if depth_multi == 1.0: + return repeats + return int(math.ceil(depth_multi * repeats)) + + @property + def feature_dim(self): + return self._feature_dim + + @property + def intermediate_features_dim(self): + return self._intermediate_features_dim + + def forward(self, x): + x = self.mod1(x) + + all_hidden_states = () if self.use_intermediate_features else None + for stage in self.stages: + x = stage(x) + if self.use_intermediate_features: + all_hidden_states = all_hidden_states + (x, ) + + if self.use_intermediate_features: + return BackboneOutput(intermediate_features=all_hidden_states) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + + if self.training and (self.dropout_rate is not None): + x = F.dropout(input=x, p=self.dropout_rate, + training=self.training, ) + + return BackboneOutput(last_feature=x) + + +def mixnet(task, conf_model_backbone) -> MixNet: + return MixNet(task, conf_model_backbone.params, conf_model_backbone.stage_params) \ No newline at end of file diff --git a/src/netspresso_trainer/models/backbones/experimental/mobilenetv3.py b/src/netspresso_trainer/models/backbones/experimental/mobilenetv3.py index d87b9695..f1c1e486 100644 --- a/src/netspresso_trainer/models/backbones/experimental/mobilenetv3.py +++ b/src/netspresso_trainer/models/backbones/experimental/mobilenetv3.py @@ -2,8 +2,9 @@ Based on the Torchvision implementation of MobileNetV3. 
https://pytorch.org/vision/main/_modules/torchvision/models/mobilenetv3.html """ -from typing import List +from typing import List, Dict, Optional +from omegaconf import DictConfig import torch import torch.nn as nn from torch import Tensor @@ -11,25 +12,18 @@ from ...op.custom import ConvLayer, InvertedResidual from ...utils import BackboneOutput -__all__ = ['mobilenetv3_small'] +__all__ = ['mobilenetv3'] SUPPORTING_TASK = ['classification', 'segmentation'] -def list_depth(block_info): - if isinstance(block_info[0], list): - return 1 + list_depth(block_info[0]) - else: - return 1 - - class MobileNetV3(nn.Module): def __init__( self, task: str, - block_info, # [in_channels, kernel, expended_channels, out_channels, use_se, activation, stride, dilation] - **kwargs + params: Optional[DictConfig] = None, + stage_params: Optional[List] = None, ) -> None: super(MobileNetV3, self).__init__() @@ -39,7 +33,7 @@ def __init__( act_type = 'hard_swish' # building first layer - firstconv_output_channels = block_info[0][0][0] + firstconv_output_channels = stage_params[0].in_channels[0] self.conv_first = ConvLayer( in_channels=3, out_channels=firstconv_output_channels, @@ -52,20 +46,16 @@ def __init__( # building inverted residual blocks stages: List[nn.Module] = [] - lastconv_input_channels = block_info[-1][-1][3] + lastconv_input_channels = stage_params[-1].out_channels[-1] lastconv_output_channels = 6 * lastconv_input_channels - for stg_idx, stage_info in enumerate(block_info): + for stg_idx, stage_info in enumerate(stage_params): stage: List[nn.Module] = [] - for block in stage_info: - in_channels = block[0] - kernel_size = block[1] - hidden_channels = block[2] - out_channels = block[3] - use_se = block[4] - act_type_b = block[5].lower() - stride = block[6] - dilation = block[7] + for block in zip(stage_info.in_channels, stage_info.kernel, stage_info.expanded_channels, + stage_info.out_channels, stage_info.use_se, stage_info.activation, + stage_info.stride, stage_info.dilation): + in_channels, kernel_size, hidden_channels, out_channels, use_se, act_type_b, stride, dilation = block + act_type_b = act_type_b.lower() stage.append( InvertedResidual(in_channels=in_channels, hidden_channels=hidden_channels, @@ -79,7 +69,7 @@ def __init__( ) # add last conv - if stg_idx == len(block_info) - 1: + if stg_idx == len(stage_params) - 1: stage.append( ConvLayer(in_channels=lastconv_input_channels, out_channels=lastconv_output_channels, @@ -140,5 +130,5 @@ def task_support(self, task): return task.lower() in SUPPORTING_TASK -def mobilenetv3_small(task, conf_model_backbone) -> MobileNetV3: - return MobileNetV3(task, **conf_model_backbone) +def mobilenetv3(task, conf_model_backbone) -> MobileNetV3: + return MobileNetV3(task, conf_model_backbone.params, conf_model_backbone.stage_params) diff --git a/src/netspresso_trainer/models/backbones/experimental/mobilevit.py b/src/netspresso_trainer/models/backbones/experimental/mobilevit.py index 1adbd28e..5a8ed0e9 100644 --- a/src/netspresso_trainer/models/backbones/experimental/mobilevit.py +++ b/src/netspresso_trainer/models/backbones/experimental/mobilevit.py @@ -5,8 +5,9 @@ import argparse import math -from typing import Any, Dict, Literal, Optional, Tuple, Union +from typing import Any, Dict, Literal, Optional, Tuple, Union, List +from omegaconf import DictConfig import torch import torch.nn as nn import torch.nn.functional as F @@ -252,27 +253,38 @@ def forward( return out class MobileViTEncoder(MetaFormerEncoder): - def __init__(self, out_channels, block_type, 
num_blocks, stride, hidden_size, intermediate_size, num_transformer_blocks, dilate, expand_ratio, - patch_embedding_out_channels, local_kernel_size, patch_size, - num_attention_heads, attention_dropout_prob, hidden_dropout_prob, layer_norm_eps, use_fusion_layer) -> None: + def __init__( + self, + params: Optional[DictConfig] = None, + stage_params: Optional[List] = None, + ) -> None: super().__init__() stages = [] self.dilation = 1 - self.local_kernel_size = local_kernel_size - self.patch_size = patch_size - self.num_attention_heads = num_attention_heads - self.attention_dropout_prob = attention_dropout_prob - self.hidden_dropout_prob = hidden_dropout_prob - self.layer_norm_eps = layer_norm_eps - self.use_fusion_layer = use_fusion_layer + self.local_kernel_size = params.local_kernel_size + self.patch_size = params.patch_size + self.num_attention_heads = params.num_attention_heads + self.attention_dropout_prob = params.attention_dropout_prob + self.hidden_dropout_prob = params.hidden_dropout_prob + self.layer_norm_eps = params.layer_norm_eps + self.use_fusion_layer = params.use_fusion_layer - in_channels = patch_embedding_out_channels - for idx in range(len(out_channels)): - stages.append(self._make_block(out_channels[idx], block_type[idx], num_blocks[idx], stride[idx], hidden_size[idx], - intermediate_size[idx], num_transformer_blocks[idx], dilate[idx], expand_ratio[idx], + in_channels = params.patch_embedding_out_channels + for stage in stage_params: + out_channels = stage.out_channels + block_type = stage.block_type + num_blocks = stage.num_blocks + stride = stage.stride + hidden_size = stage.hidden_size + intermediate_size = stage.intermediate_size + num_transformer_blocks = stage.num_transformer_blocks + dilate = stage.dilate + expand_ratio = stage.expand_ratio + stages.append(self._make_block(out_channels, block_type, num_blocks, stride, hidden_size, + intermediate_size, num_transformer_blocks, dilate, expand_ratio, in_channels)) - in_channels = out_channels[idx] + in_channels = out_channels self.blocks = nn.Sequential(*stages) def _make_block(self, out_channels, block_type: Literal['mv2', 'mobilevit'], num_blocks, stride, hidden_size, intermediate_size, num_transformer_blocks, dilate, expand_ratio, in_channels): @@ -346,26 +358,23 @@ def _make_mobilevit_blocks(self, num_transformer_blocks, in_channels, out_channe class MobileViT(MetaFormer): def __init__( - self, task, - out_channels, block_type, num_blocks, stride, hidden_size, intermediate_size, num_transformer_blocks, dilate, expand_ratio, - patch_embedding_out_channels, local_kernel_size, patch_size, - num_attention_heads, attention_dropout_prob, hidden_dropout_prob, - exp_factor, layer_norm_eps=1e-6, use_fusion_layer = True, - **kwargs + self, + task: str, + params: Optional[DictConfig] = None, + stage_params: Optional[List] = None, ) -> None: - exp_channels = min(exp_factor * out_channels[-1], 960) - hidden_sizes = out_channels + [exp_channels] + exp_channels = min(params.exp_factor * stage_params[-1].out_channels, 960) + hidden_sizes = [stage.out_channels for stage in stage_params] + [exp_channels] super().__init__(hidden_sizes) self.task = task self.intermediate_features = self.task in ['segmentation', 'detection'] image_channels = 3 - self.patch_embed = MobileViTEmbeddings(image_channels, patch_embedding_out_channels) - self.encoder = MobileViTEncoder(out_channels, block_type, num_blocks, stride, hidden_size, intermediate_size, num_transformer_blocks, dilate, expand_ratio, - patch_embedding_out_channels, local_kernel_size, 
patch_size, num_attention_heads, attention_dropout_prob, hidden_dropout_prob, layer_norm_eps, use_fusion_layer) + self.patch_embed = MobileViTEmbeddings(image_channels, params.patch_embedding_out_channels) + self.encoder = MobileViTEncoder(params=params, stage_params=stage_params) - self.conv_1x1_exp = ConvLayer(in_channels=out_channels[-1], out_channels=exp_channels, + self.conv_1x1_exp = ConvLayer(in_channels=stage_params[-1].out_channels, out_channels=exp_channels, kernel_size=1, stride=1, use_act=True, use_norm=True, act_type='silu') self.pool = GlobalPool(pool_type="mean", keep_dim=False) @@ -380,4 +389,4 @@ def forward(self, x: FXTensorType): return BackboneOutput(last_feature=feat) def mobilevit(task, conf_model_backbone): - return MobileViT(task, **conf_model_backbone) \ No newline at end of file + return MobileViT(task, conf_model_backbone.params, conf_model_backbone.stage_params) \ No newline at end of file diff --git a/src/netspresso_trainer/models/backbones/experimental/resnet.py b/src/netspresso_trainer/models/backbones/experimental/resnet.py index 77baeb5f..1aefbb0b 100644 --- a/src/netspresso_trainer/models/backbones/experimental/resnet.py +++ b/src/netspresso_trainer/models/backbones/experimental/resnet.py @@ -4,6 +4,7 @@ """ from typing import Dict, List, Literal, Optional, Type, Union +from omegaconf import DictConfig import torch import torch.nn as nn from torch import Tensor @@ -11,7 +12,7 @@ from ...op.custom import BasicBlock, Bottleneck, ConvLayer from ...utils import BackboneOutput -__all__ = ['resnet50'] +__all__ = ['resnet'] SUPPORTING_TASK = ['classification', 'segmentation'] @@ -26,18 +27,18 @@ class ResNet(nn.Module): def __init__( self, task: str, - block: Literal['basicblock', 'bottleneck'], - layers: List[int], - zero_init_residual: bool = False, - groups: int = 1, - width_per_group: int = 64, - replace_stride_with_dilation: Optional[List[bool]] = None, - norm_layer: Optional[str] = None, - expansion: Optional[int] = None, - **kwargs + params: Optional[DictConfig] = None, + stage_params: Optional[List] = None, ) -> None: super(ResNet, self).__init__() + block: Literal['basicblock', 'bottleneck'] = params.block + zero_init_residual: bool = params.zero_init_residual + groups: int = params.groups + width_per_group: int = params.width_per_group + norm_layer: Optional[str] = params.norm_layer + expansion: Optional[int] = params.expansion + self.task = task.lower() block = BLOCK_FROM_LITERAL[block.lower()] self.use_intermediate_features = self.task in ['segmentation', 'detection'] @@ -48,13 +49,9 @@ def __init__( self.inplanes = 64 self.dilation = 1 - if replace_stride_with_dilation is None: - # each element in the tuple indicates if we should replace - # the 2x2 stride with a dilated convolution instead - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError("replace_stride_with_dilation should be None " - "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + for i in range(1, len(stage_params)): + if 'replace_stride_with_dilation' not in stage_params[i]: + stage_params[i]['replace_stride_with_dilation'] = False self.groups = groups self.base_width = width_per_group @@ -64,22 +61,23 @@ def __init__( self.conv1 = ConvLayer(in_channels=3, out_channels=self.inplanes, kernel_size=7, stride=2, padding=3, bias=False, norm_type='batch_norm', act_type='relu') - - planes = [64, 128, 256, 512] self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = 
self._make_layer(block, planes[0], layers[0], expansion=expansion) - self.layer2 = self._make_layer(block, planes[1], layers[1], stride=2, - dilate=replace_stride_with_dilation[0], - expansion=expansion) - self.layer3 = self._make_layer(block, planes[2], layers[2], stride=2, - dilate=replace_stride_with_dilation[1], - expansion=expansion) - self.layer4 = self._make_layer(block, planes[3], layers[3], stride=2, - dilate=replace_stride_with_dilation[2], - expansion=expansion) + + stages: List[nn.Module] = [] + + first_stage = stage_params[0] + layer = self._make_layer(block, first_stage['plane'], first_stage['layers'], expansion=expansion) + stages.append(layer) + for stage in stage_params[1:]: + layer = self._make_layer(block, stage['plane'], stage['layers'], stride=2, + dilate=stage['replace_stride_with_dilation'], + expansion=expansion) + stages.append(layer) + + self.stages = nn.ModuleList(stages) self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - hidden_sizes = [h * 4 for h in planes] + hidden_sizes = [stage['plane'] * expansion for stage in stage_params] self._feature_dim = hidden_sizes[-1] self._intermediate_features_dim = hidden_sizes @@ -134,8 +132,8 @@ def forward(self, x: Tensor): x = self.maxpool(x) all_hidden_states = () if self.use_intermediate_features else None - for layer in [self.layer1, self.layer2, self.layer3, self.layer4]: - x = layer(x) + for stage in self.stages: + x = stage(x) if self.use_intermediate_features: all_hidden_states = all_hidden_states + (x,) @@ -160,8 +158,8 @@ def task_support(self, task): return task.lower() in SUPPORTING_TASK -def resnet50(task, conf_model_backbone) -> ResNet: +def resnet(task, conf_model_backbone) -> ResNet: """ - ResNet-50 model from "Deep Residual Learning for Image Recognition" https://arxiv.org/pdf/1512.03385.pdf. + ResNet model from "Deep Residual Learning for Image Recognition" https://arxiv.org/pdf/1512.03385.pdf. 
""" - return ResNet(task, **conf_model_backbone) + return ResNet(task, conf_model_backbone.params, conf_model_backbone.stage_params) diff --git a/src/netspresso_trainer/models/backbones/experimental/segformer.py b/src/netspresso_trainer/models/backbones/experimental/segformer.py index 9fb55c87..8dee0d82 100644 --- a/src/netspresso_trainer/models/backbones/experimental/segformer.py +++ b/src/netspresso_trainer/models/backbones/experimental/segformer.py @@ -1,6 +1,7 @@ import math -from typing import Optional +from typing import Optional, List, Dict +from omegaconf import DictConfig import torch import torch.nn as nn @@ -135,42 +136,59 @@ def forward(self, x, height, width): class SegFormer(MetaFormer): - def __init__(self, task, num_modules, num_blocks, embedding_patch_sizes, embedding_strides, hidden_sizes, - num_attention_heads, attention_dropout_prob, sr_ratios, - intermediate_ratio, hidden_dropout_prob, hidden_activation_type, layer_norm_eps, - **kwargs): - super().__init__(hidden_sizes) + def __init__( + self, + task: str, + params: Optional[DictConfig] = None, + stage_params: Optional[List] = None, + ) -> None: + super().__init__([stage.hidden_sizes for stage in stage_params]) self.task = task self.use_intermediate_features = self.task in ['segmentation', 'detection'] - image_channels = 3 + intermediate_ratio = params.intermediate_ratio + hidden_activation_type = params.hidden_activation_type + hidden_dropout_prob = params.hidden_dropout_prob + attention_dropout_prob = params.attention_dropout_prob + layer_norm_eps = params.layer_norm_eps + + in_channels = 3 self.encoder_modules = nn.ModuleList() - for i in range(num_modules): + for blocks in stage_params: + num_blocks = blocks.num_blocks + sr_ratios = blocks.sr_ratios + hidden_sizes = blocks.hidden_sizes + embedding_patch_sizes = blocks.embedding_patch_sizes + embedding_strides = blocks.embedding_strides + num_attention_heads = blocks.num_attention_heads + module = nn.ModuleDict( { 'patch_embed': SegformerOverlapPatchEmbeddings( - embedding_patch_sizes[i], - embedding_strides[i], - image_channels if i == 0 else hidden_sizes[i - 1], - hidden_sizes[i] + embedding_patch_sizes, + embedding_strides, + in_channels, + hidden_sizes ), 'encoder': SegformerEncoder( - num_blocks[i], - hidden_sizes[i], - num_attention_heads[i], + num_blocks, + hidden_sizes, + num_attention_heads, attention_dropout_prob, - sr_ratios[i], + sr_ratios, intermediate_ratio, hidden_dropout_prob, hidden_activation_type, layer_norm_eps ), - 'norm': nn.LayerNorm(hidden_sizes[i]) + 'norm': nn.LayerNorm(hidden_sizes) } ) self.encoder_modules.append(module) + in_channels = hidden_sizes + def forward(self, x): B = x.size(0) all_hidden_states = () if self.use_intermediate_features else None @@ -194,4 +212,4 @@ def forward(self, x): def segformer(task, conf_model_backbone) -> SegformerEncoder: - return SegFormer(task, **conf_model_backbone) + return SegFormer(task, conf_model_backbone.params, conf_model_backbone.stage_params) diff --git a/src/netspresso_trainer/models/backbones/experimental/vit.py b/src/netspresso_trainer/models/backbones/experimental/vit.py index f5344f20..31cedd8a 100644 --- a/src/netspresso_trainer/models/backbones/experimental/vit.py +++ b/src/netspresso_trainer/models/backbones/experimental/vit.py @@ -3,8 +3,9 @@ https://github.com/apple/ml-cvnets/blob/84d992f413e52c0468f86d23196efd9dad885e6f/cvnets/models/classification/vit.py """ import argparse -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union, 
List +from omegaconf import DictConfig import torch import torch.nn as nn @@ -93,19 +94,21 @@ def __init__(self, num_blocks, hidden_size, num_attention_heads, attention_dropo class VisionTransformer(MetaFormer): def __init__( self, - task, - patch_size, - hidden_size, - num_blocks, - num_attention_heads, - attention_dropout_prob, - intermediate_size, - hidden_dropout_prob, - layer_norm_eps=1e-6, - use_cls_token=True, - vocab_size=1000, - **kwargs + task: str, + params: Optional[DictConfig] = None, + stage_params: Optional[List] = None, ) -> None: + patch_size = params.patch_size + hidden_size = params.hidden_size + num_blocks = params.num_blocks + num_attention_heads = params.num_attention_heads + attention_dropout_prob = params.attention_dropout_prob + intermediate_size = params.intermediate_size + hidden_dropout_prob = params.hidden_dropout_prob + layer_norm_eps = params.layer_norm_eps + use_cls_token = params.use_cls_token + vocab_size = params.vocab_size + hidden_sizes = hidden_size if isinstance(hidden_size, list) else [hidden_size] * num_blocks super().__init__(hidden_sizes) self.task = task @@ -119,4 +122,4 @@ def __init__( def vit(task, conf_model_backbone): # ViT tiny - return VisionTransformer(task, **conf_model_backbone) \ No newline at end of file + return VisionTransformer(task, conf_model_backbone.params, conf_model_backbone.stage_params) \ No newline at end of file diff --git a/src/netspresso_trainer/models/base.py b/src/netspresso_trainer/models/base.py index ec341eb2..a063dae5 100644 --- a/src/netspresso_trainer/models/base.py +++ b/src/netspresso_trainer/models/base.py @@ -7,7 +7,7 @@ import torch.nn as nn from omegaconf import OmegaConf -from .registry import MODEL_BACKBONE_DICT, MODEL_HEAD_DICT +from .registry import MODEL_BACKBONE_DICT, MODEL_HEAD_DICT, MODEL_NECK_DICT from .utils import BackboneOutput, DetectionModelOutput, ModelOutput, load_from_checkpoint logger = logging.getLogger("netspresso_trainer") @@ -22,18 +22,24 @@ def __init__(self, conf_model, task, backbone_name, head_name, num_classes, mode self.head_name = head_name backbone_fn: Callable[..., nn.Module] = MODEL_BACKBONE_DICT[backbone_name] - conf_model_backbone = OmegaConf.to_object(conf_model.architecture.backbone) - self.backbone: nn.Module = backbone_fn(task=self.task, conf_model_backbone=conf_model_backbone) + self.backbone: nn.Module = backbone_fn(task=self.task, conf_model_backbone=conf_model.architecture.backbone) self.backbone = load_from_checkpoint(self.backbone, model_checkpoint) + intermediate_features_dim = self.backbone.intermediate_features_dim + if getattr(conf_model.architecture, 'neck', None): + neck_name = conf_model.architecture.neck.name + neck_fn: Callable[..., nn.Module] = MODEL_NECK_DICT[neck_name] + self.neck = neck_fn(intermediate_features_dim=self.backbone.intermediate_features_dim) + intermediate_features_dim = self.neck.intermediate_features_dim + head_module = MODEL_HEAD_DICT[self.task][head_name] if task == 'classification': self.head = head_module(num_classes=num_classes, feature_dim=self.backbone.feature_dim) elif task in ['segmentation', 'detection']: img_size = img_size if isinstance(img_size, (int, None)) else tuple(img_size) self.head = head_module(num_classes=num_classes, - intermediate_features_dim=self.backbone.intermediate_features_dim, + intermediate_features_dim=intermediate_features_dim, label_size=img_size) if freeze_backbone: @@ -74,6 +80,8 @@ def __init__(self, conf_model, task, backbone_name, head_name, num_classes, mode def forward(self, x, 
label_size=None, targets=None): features: BackboneOutput = self.backbone(x) + if self.neck: + features: BackboneOutput = self.neck(features['intermediate_features']) out: ModelOutput = self.head(features['intermediate_features']) return out @@ -85,5 +93,7 @@ def __init__(self, conf_model, task, backbone_name, head_name, num_classes, mode def forward(self, x, label_size=None, targets=None): features: BackboneOutput = self.backbone(x) + if self.neck: + features: BackboneOutput = self.neck(features['intermediate_features']) out: DetectionModelOutput = self.head(features['intermediate_features']) return out diff --git a/src/netspresso_trainer/models/builder.py b/src/netspresso_trainer/models/builder.py index 78e9d13f..6c9202a2 100644 --- a/src/netspresso_trainer/models/builder.py +++ b/src/netspresso_trainer/models/builder.py @@ -16,14 +16,14 @@ def load_full_model(conf_model, model_name, num_classes, model_checkpoint): model_fn: Callable[..., nn.Module] = MODEL_FULL_DICT[model_name] - conf_model_full = OmegaConf.to_object(conf_model.architecture.full) - model: nn.Module = model_fn(num_classes=num_classes, conf_model_full=conf_model_full) + model: nn.Module = model_fn(num_classes=num_classes, conf_model_full=conf_model.architecture.full) model = load_from_checkpoint(model, model_checkpoint) return model -def load_backbone_and_head_model(conf_model, task, backbone_name, head_name, num_classes, model_checkpoint, img_size, freeze_backbone): +def load_backbone_and_head_model( + conf_model, task, backbone_name, head_name, num_classes, model_checkpoint, img_size, freeze_backbone): TASK_MODEL_DICT: Dict[str, Type[TaskModel]] = { 'classification': ClassificationModel, 'segmentation': SegmentationModel, @@ -31,9 +31,11 @@ def load_backbone_and_head_model(conf_model, task, backbone_name, head_name, num } if task not in TASK_MODEL_DICT: - raise ValueError(f"No such task(s) named: {task}. This should be included in SUPPORTING_TASK_LIST ({SUPPORTING_TASK_LIST})") + raise ValueError( + f"No such task(s) named: {task}. 
This should be included in SUPPORTING_TASK_LIST ({SUPPORTING_TASK_LIST})") - return TASK_MODEL_DICT[task](conf_model, task, backbone_name, head_name, num_classes, model_checkpoint, img_size, freeze_backbone) + return TASK_MODEL_DICT[task]( + conf_model, task, backbone_name, head_name, num_classes, model_checkpoint, img_size, freeze_backbone) def build_model(conf_model, task, num_classes, model_checkpoint, img_size) -> nn.Module: @@ -45,4 +47,5 @@ def build_model(conf_model, task, num_classes, model_checkpoint, img_size) -> nn backbone_name = str(conf_model.architecture.backbone.name).lower() head_name = str(conf_model.architecture.head.name).lower() freeze_backbone = conf_model.freeze_backbone - return load_backbone_and_head_model(conf_model, task, backbone_name, head_name, num_classes, model_checkpoint, img_size, freeze_backbone) + return load_backbone_and_head_model( + conf_model, task, backbone_name, head_name, num_classes, model_checkpoint, img_size, freeze_backbone) diff --git a/src/netspresso_trainer/models/full/experimental/pidnet.py b/src/netspresso_trainer/models/full/experimental/pidnet.py index b3b630c9..10d3ac04 100644 --- a/src/netspresso_trainer/models/full/experimental/pidnet.py +++ b/src/netspresso_trainer/models/full/experimental/pidnet.py @@ -3,6 +3,7 @@ # ------------------------------------------------------------------------------ import logging import time +from typing import Optional, List, Dict import torch import torch.nn as nn @@ -17,8 +18,19 @@ class PIDNet(nn.Module): - def __init__(self, num_classes=19, m=2, n=3, planes=64, ppm_planes=96, head_planes=128, is_training=True, **kwargs): + def __init__( + self, + params: Optional[Dict] = None + ) -> None: super(PIDNet, self).__init__() + num_classes = params.num_classes + m = params.m + n = params.n + planes = params.planes + ppm_planes = params.ppm_planes + head_planes = params.head_planes + is_training = params.is_training + self.is_training = is_training # I Branch @@ -195,5 +207,6 @@ def forward(self, x: FXTensorType, label_size=None) -> PIDNetModelOutput: def pidnet(num_classes: int, conf_model_full) -> PIDNet: # PIDNet-S - return PIDNet(num_classes=num_classes, is_training=True, **conf_model_full) - + conf_model_full.num_classes = num_classes + conf_model_full.is_training = True + return PIDNet(params=conf_model_full) \ No newline at end of file diff --git a/src/netspresso_trainer/models/heads/detection/__init__.py b/src/netspresso_trainer/models/heads/detection/__init__.py index 455a00ee..8d362011 100644 --- a/src/netspresso_trainer/models/heads/detection/__init__.py +++ b/src/netspresso_trainer/models/heads/detection/__init__.py @@ -1,2 +1,2 @@ from .experimental.faster_rcnn import faster_rcnn -from .experimental.yolo_head import yolo_head \ No newline at end of file +from .experimental.yolox_head import yolox_head \ No newline at end of file diff --git a/src/netspresso_trainer/models/heads/detection/experimental/detection/generalized_rcnn.py b/src/netspresso_trainer/models/heads/detection/experimental/detection/generalized_rcnn.py index 54095e8a..70436fcd 100644 --- a/src/netspresso_trainer/models/heads/detection/experimental/detection/generalized_rcnn.py +++ b/src/netspresso_trainer/models/heads/detection/experimental/detection/generalized_rcnn.py @@ -19,19 +19,15 @@ class GeneralizedRCNN(nn.Module): detections / masks from it. 
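Editor's note: with the FPN removed from the two-stage head (see the faster_rcnn.py hunk below), the RoI pooler's feature-map names are now derived from the incoming channel list rather than from a head-owned neck. A tiny sketch of that naming, with an assumed channel list:

```python
# Assumed backbone/neck output dims; only the naming scheme follows the diff below.
intermediate_features_dim = [256, 512, 1024, 2048]
featmap_names = [str(i) for i in range(len(intermediate_features_dim))]
# -> ['0', '1', '2', '3'], consumed by MultiScaleRoIAlign(featmap_names=featmap_names, ...)
```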
""" - def __init__(self, neck:nn.Module, rpn: nn.Module, roi_heads: nn.Module, image_size: Tuple[int, int]) -> None: + def __init__(self, rpn: nn.Module, roi_heads: nn.Module, image_size: Tuple[int, int]) -> None: super().__init__() # _log_api_usage_once(self) - self.neck = neck self.rpn = rpn self.roi_heads = roi_heads self.image_size = image_size def forward(self, features: FXTensorListType) -> DetectionModelOutput: - if self.neck: - features = self.neck(features) - features = {str(k): v for k, v in enumerate(features)} rpn_features = self.rpn(features, self.image_size) roi_features = self.roi_heads(features, rpn_features['boxes'], self.image_size) diff --git a/src/netspresso_trainer/models/heads/detection/experimental/faster_rcnn.py b/src/netspresso_trainer/models/heads/detection/experimental/faster_rcnn.py index 5127d4f2..6b9d46a0 100644 --- a/src/netspresso_trainer/models/heads/detection/experimental/faster_rcnn.py +++ b/src/netspresso_trainer/models/heads/detection/experimental/faster_rcnn.py @@ -2,7 +2,6 @@ import torch.nn.functional as F from .detection import AnchorGenerator, RPNHead, RegionProposalNetwork, RoIHeads, GeneralizedRCNN, MultiScaleRoIAlign -from .fpn import FPN IMAGE_SIZE = (512, 512) # TODO: Get from configuration @@ -43,8 +42,6 @@ def __init__( ): assert fpn_num_outs == len(anchor_sizes) - neck = FPN(in_channels=intermediate_features_dim, out_channels=intermediate_features_dim[-1], num_outs=fpn_num_outs) - out_channels = intermediate_features_dim[-1] aspect_ratios = (aspect_ratios,) * len(anchor_sizes) @@ -65,7 +62,7 @@ def __init__( score_thresh=rpn_score_thresh, ) - featmap_names = [str(i) for i in range(neck.num_outs)] + featmap_names = [str(i) for i in range(len(intermediate_features_dim))] box_roi_pool = MultiScaleRoIAlign(featmap_names=featmap_names, output_size=roi_output_size, sampling_ratio=roi_sampling_ratio) box_head = TwoMLPHead(out_channels * roi_output_size**2, roi_representation_size) @@ -87,7 +84,7 @@ def __init__( box_detections_per_img, ) - super().__init__(neck, rpn, roi_heads, IMAGE_SIZE) + super().__init__(rpn, roi_heads, IMAGE_SIZE) class TwoMLPHead(nn.Module): diff --git a/src/netspresso_trainer/models/heads/detection/experimental/yolo_head.py b/src/netspresso_trainer/models/heads/detection/experimental/yolox_head.py similarity index 95% rename from src/netspresso_trainer/models/heads/detection/experimental/yolo_head.py rename to src/netspresso_trainer/models/heads/detection/experimental/yolox_head.py index 9376a2d5..7135775b 100644 --- a/src/netspresso_trainer/models/heads/detection/experimental/yolo_head.py +++ b/src/netspresso_trainer/models/heads/detection/experimental/yolox_head.py @@ -6,7 +6,7 @@ import torch.nn as nn from ....op.custom import ConvLayer -from .fpn import PAFPN +from ....utils import ModelOutput class YOLOXHead(nn.Module): @@ -25,8 +25,6 @@ def __init__( self.num_classes = num_classes - self.neck = PAFPN(in_channels=intermediate_features_dim, act_type=act_type) - self.cls_convs = nn.ModuleList() self.reg_convs = nn.ModuleList() self.cls_preds = nn.ModuleList() @@ -116,7 +114,6 @@ def __init__( def forward(self, xin): outputs = [] - xin = self.neck(xin) for k, (cls_conv, reg_conv, x) in enumerate(zip(self.cls_convs, self.reg_convs, xin)): x = self.stems[k](x) @@ -134,10 +131,10 @@ def forward(self, xin): outputs.append(output) - return outputs + return ModelOutput(pred=outputs) -def yolo_head(num_classes, intermediate_features_dim, **kwargs): +def yolox_head(num_classes, intermediate_features_dim, **kwargs): configuration 
= { 'act_type': 'silu', } diff --git a/src/netspresso_trainer/models/necks/__init__.py b/src/netspresso_trainer/models/necks/__init__.py new file mode 100644 index 00000000..dfbeec68 --- /dev/null +++ b/src/netspresso_trainer/models/necks/__init__.py @@ -0,0 +1,2 @@ +from .experimental.fpn import fpn +from .experimental.pafpn import pafpn diff --git a/src/netspresso_trainer/models/necks/core/.gitkeep b/src/netspresso_trainer/models/necks/core/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/src/netspresso_trainer/models/necks/core/__init__.py b/src/netspresso_trainer/models/necks/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/netspresso_trainer/models/necks/experimental/__init__.py b/src/netspresso_trainer/models/necks/experimental/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/netspresso_trainer/models/heads/detection/experimental/fpn.py b/src/netspresso_trainer/models/necks/experimental/fpn.py similarity index 64% rename from src/netspresso_trainer/models/heads/detection/experimental/fpn.py rename to src/netspresso_trainer/models/necks/experimental/fpn.py index 27de479c..16ca0717 100644 --- a/src/netspresso_trainer/models/heads/detection/experimental/fpn.py +++ b/src/netspresso_trainer/models/necks/experimental/fpn.py @@ -1,8 +1,7 @@ -import torch import torch.nn as nn import torch.nn.functional as F -from ....op.custom import ConvLayer, CSPLayer +from ...utils import BackboneOutput class FPN(nn.Module): @@ -92,6 +91,8 @@ def __init__(self, extra_fpn_conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1) self.fpn_convs.append(extra_fpn_conv) + self._intermediate_features_dim = [out_channels for _ in range(num_outs)] + def forward(self, inputs): """Forward function.""" assert len(inputs) == len(self.in_channels) @@ -144,118 +145,28 @@ def forward(self, inputs): outs.append(self.fpn_convs[i](F.relu(outs[-1]))) else: outs.append(self.fpn_convs[i](outs[-1])) - return outs - - -class PAFPN(nn.Module): - """ - YOLOv3 model. Darknet 53 is the default backbone of this model. 
- """ - - def __init__( - self, - in_channels, - act_type="silu", - ): - super().__init__() - - self.in_channels = in_channels - Conv = ConvLayer - - # TODO: Get from config - depth = 0.33 - - self.upsample = nn.Upsample(scale_factor=2, mode="nearest") - self.lateral_conv0 = ConvLayer( - in_channels=int(in_channels[2]), - out_channels=int(in_channels[1]), - kernel_size=1, - stride=1, - act_type=act_type - ) - self.C3_p4 = CSPLayer( - in_channels=int(2 * in_channels[1]), - out_channels=int(in_channels[1]), - n=round(3 * depth), - shortcut=False, - act_type=act_type, - ) # cat - - self.reduce_conv1 = ConvLayer( - in_channels=int(in_channels[1]), - out_channels=int(in_channels[0]), - kernel_size=1, - stride=1, - act_type=act_type - ) - self.C3_p3 = CSPLayer( - in_channels=int(2 * in_channels[0]), - out_channels=int(in_channels[0]), - n=round(3 * depth), - shortcut=False, - act_type=act_type, - ) - - # bottom-up conv - self.bu_conv2 = Conv( - in_channels=int(in_channels[0]), - out_channels=int(in_channels[0]), - kernel_size=3, - stride=2, - act_type=act_type - ) - self.C3_n3 = CSPLayer( - in_channels=int(2 * in_channels[0]), - out_channels=int(in_channels[1]), - n=round(3 * depth), - shortcut=False, - act_type=act_type, - ) - - # bottom-up conv - self.bu_conv1 = Conv( - in_channels=int(in_channels[1]), - out_channels=int(in_channels[1]), - kernel_size=3, - stride=2, - act_type=act_type - ) - self.C3_n4 = CSPLayer( - in_channels=int(2 * in_channels[1]), - out_channels=int(in_channels[2]), - n=round(3 * depth), - shortcut=False, - act_type=act_type, - ) - - def forward(self, inputs): - """ - Args: - inputs: input images. - - Returns: - Tuple[Tensor]: FPN feature. - """ - - [x2, x1, x0] = inputs - - fpn_out0 = self.lateral_conv0(x0) # 1024->512/32 - f_out0 = self.upsample(fpn_out0) # 512/16 - f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16 - f_out0 = self.C3_p4(f_out0) # 1024->512/16 - - fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16 - f_out1 = self.upsample(fpn_out1) # 256/8 - f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8 - pan_out2 = self.C3_p3(f_out1) # 512->256/8 - - p_out1 = self.bu_conv2(pan_out2) # 256->256/16 - p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16 - pan_out1 = self.C3_n3(p_out1) # 512->512/16 - - p_out0 = self.bu_conv1(pan_out1) # 512->512/32 - p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32 - pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 - - outputs = (pan_out2, pan_out1, pan_out0) - return outputs + return BackboneOutput(intermediate_features=outs) + + @property + def intermediate_features_dim(self): + return self._intermediate_features_dim + + +def fpn(intermediate_features_dim, **kwargs): + configuration = { + 'num_outs': 4, + 'start_level': 0, + 'end_level': -1, + 'add_extra_convs': False, + 'relu_before_extra_convs': False, + 'no_norm_on_lateral': False, + 'conv_cfg': None, + 'norm_cfg': None, + 'act_cfg': None, + 'upsample_cfg': None, + 'init_cfg': None + } + + return FPN(in_channels=intermediate_features_dim, + out_channels=intermediate_features_dim[-1], + **configuration) diff --git a/src/netspresso_trainer/models/necks/experimental/pafpn.py b/src/netspresso_trainer/models/necks/experimental/pafpn.py new file mode 100644 index 00000000..50de7228 --- /dev/null +++ b/src/netspresso_trainer/models/necks/experimental/pafpn.py @@ -0,0 +1,132 @@ +import torch +import torch.nn as nn + +from ...op.custom import ConvLayer, CSPLayer +from ...utils import BackboneOutput + + +class PAFPN(nn.Module): + """ + YOLOv3 model. 
Darknet 53 is the default backbone of this model. + """ + + def __init__( + self, + in_channels, + act_type="silu", + ): + super().__init__() + + self.in_channels = in_channels + Conv = ConvLayer + + # TODO: Get from config + depth = 0.33 + + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + self.lateral_conv0 = ConvLayer( + in_channels=int(in_channels[2]), + out_channels=int(in_channels[1]), + kernel_size=1, + stride=1, + act_type=act_type + ) + self.C3_p4 = CSPLayer( + in_channels=int(2 * in_channels[1]), + out_channels=int(in_channels[1]), + n=round(3 * depth), + shortcut=False, + act_type=act_type, + ) # cat + + self.reduce_conv1 = ConvLayer( + in_channels=int(in_channels[1]), + out_channels=int(in_channels[0]), + kernel_size=1, + stride=1, + act_type=act_type + ) + self.C3_p3 = CSPLayer( + in_channels=int(2 * in_channels[0]), + out_channels=int(in_channels[0]), + n=round(3 * depth), + shortcut=False, + act_type=act_type, + ) + + # bottom-up conv + self.bu_conv2 = Conv( + in_channels=int(in_channels[0]), + out_channels=int(in_channels[0]), + kernel_size=3, + stride=2, + act_type=act_type + ) + self.C3_n3 = CSPLayer( + in_channels=int(2 * in_channels[0]), + out_channels=int(in_channels[1]), + n=round(3 * depth), + shortcut=False, + act_type=act_type, + ) + + # bottom-up conv + self.bu_conv1 = Conv( + in_channels=int(in_channels[1]), + out_channels=int(in_channels[1]), + kernel_size=3, + stride=2, + act_type=act_type + ) + self.C3_n4 = CSPLayer( + in_channels=int(2 * in_channels[1]), + out_channels=int(in_channels[2]), + n=round(3 * depth), + shortcut=False, + act_type=act_type, + ) + + self._intermediate_features_dim = in_channels + + def forward(self, inputs): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. 
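Editor's note: FPN and PAFPN are now standalone neck modules that return a `BackboneOutput` and report their output channels through `intermediate_features_dim`, so the task model can wire them between backbone and head. A condensed sketch of that selection logic, mirroring the base.py change earlier in this diff; the helper name `build_neck` and its argument list are illustrative, not part of the codebase.

```python
# Sketch of the backbone -> neck -> head channel hand-off added in this PR.
from typing import Callable, Dict, List, Optional, Tuple

import torch.nn as nn


def build_neck(conf_architecture,
               backbone: nn.Module,
               neck_registry: Dict[str, Callable[..., nn.Module]]
               ) -> Tuple[Optional[nn.Module], List[int]]:
    feature_dims = backbone.intermediate_features_dim
    neck = None
    if getattr(conf_architecture, 'neck', None):              # neck is optional in the config
        neck_fn = neck_registry[conf_architecture.neck.name]  # e.g. 'fpn' or 'pafpn'
        neck = neck_fn(intermediate_features_dim=feature_dims)
        feature_dims = neck.intermediate_features_dim         # head is built against these dims
    return neck, feature_dims
```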
+ """ + + [x2, x1, x0] = inputs + + fpn_out0 = self.lateral_conv0(x0) # 1024->512/32 + f_out0 = self.upsample(fpn_out0) # 512/16 + f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16 + f_out0 = self.C3_p4(f_out0) # 1024->512/16 + + fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16 + f_out1 = self.upsample(fpn_out1) # 256/8 + f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8 + pan_out2 = self.C3_p3(f_out1) # 512->256/8 + + p_out1 = self.bu_conv2(pan_out2) # 256->256/16 + p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16 + pan_out1 = self.C3_n3(p_out1) # 512->512/16 + + p_out0 = self.bu_conv1(pan_out1) # 512->512/32 + p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32 + pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 + + outputs = (pan_out2, pan_out1, pan_out0) + return BackboneOutput(intermediate_features=outputs) + + @property + def intermediate_features_dim(self): + return self._intermediate_features_dim + +def pafpn(intermediate_features_dim, **kwargs): + configuration = { + 'act_type': 'silu', + } + + return PAFPN(in_channels=intermediate_features_dim, **configuration) diff --git a/src/netspresso_trainer/models/op/base_metaformer.py b/src/netspresso_trainer/models/op/base_metaformer.py index 5a23d5cb..65237e74 100644 --- a/src/netspresso_trainer/models/op/base_metaformer.py +++ b/src/netspresso_trainer/models/op/base_metaformer.py @@ -53,7 +53,7 @@ def __init__( attention_bias_resolution = 16, ) -> None: super().__init__() - + attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size value_hidden_size = value_hidden_size if value_hidden_size is not None else attention_hidden_size @@ -62,17 +62,17 @@ def __init__( f"The hidden size {attention_hidden_size,} is not a multiple of the number of attention " f"heads {num_attention_heads}." ) - + if value_hidden_size % num_attention_heads != 0: raise ValueError( f"The hidden size {value_hidden_size,} is not a multiple of the number of attention " f"heads {num_attention_heads}." ) - + self.num_attention_heads = num_attention_heads self.attention_head_size = int(attention_hidden_size / num_attention_heads) self.value_attention_head_size = int(value_hidden_size / num_attention_heads) - + self.head_size = self.num_attention_heads * self.attention_head_size self.value_head_size = self.num_attention_heads * self.value_attention_head_size self.attention_scale = attention_scale if attention_scale is not None \ @@ -82,7 +82,7 @@ def __init__( self.query = nn.Linear(hidden_size, self.head_size, bias=use_qkv_bias) # ... x C -> ... x C_qk self.key = nn.Linear(hidden_size, self.head_size, bias=use_qkv_bias) # ... x C -> ... x C_qk self.value = nn.Linear(hidden_size, self.value_head_size, bias=use_qkv_bias) # ... x C -> ... x C_v - + self.linear = nn.Linear(self.value_head_size, hidden_size) # ... x C_v -> ... 
x C self.dropout = nn.Dropout(attention_dropout_prob) @@ -118,14 +118,14 @@ def __init__( # torch.zeros(self.num_attention_heads, len(attention_offsets))) # self.register_buffer('attention_bias_idxs_seg', # torch.LongTensor(idxs).view(N, N)) - + self.use_cross_attention = use_cross_attention def transpose_for_scores(self, x: Tensor, attention_head_size: int) -> Tensor: new_x_shape = x.size()[:-1] + (self.num_attention_heads, attention_head_size) x = x.view(new_x_shape) return x.permute(0, 2, 1, 3) - + def sequence_reduce(self, x: Tensor, height: int, width: int) -> Tensor: """SegFormer """ @@ -167,7 +167,7 @@ def forward( """ mixed_query_layer = self.query(query_states) # B x S_s x C_qk - + if not self.use_cross_attention: # Self-attention key_value_states = query_states # B x S_t(=S_s) x C_qk if self.use_sequence_reduction: @@ -180,7 +180,7 @@ def forward( attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # B x {head} x S_s x S_t attention_scores = attention_scores / self.attention_scale # B x {head} x S_s x S_t - + if self.use_attention_bias: bias = self.attention_biases[:, self.attention_bias_idxs] bias = nn.functional.interpolate(bias.unsqueeze(0), size=(attention_scores.size(-2), attention_scores.size(-1)), mode='bicubic') @@ -199,15 +199,15 @@ def forward( context_layer = context_layer.permute(0, 2, 1, 3).contiguous() # B x S_s x {head} x C_vsplit new_context_layer_shape = context_layer.size()[:-2] + (self.value_head_size,) context_layer = context_layer.view(new_context_layer_shape) # B x S_s x C_v - + context_layer = self.linear(context_layer) # B x S_s x C context_layer = self.dropout(context_layer) # B x S_s x C if self.output_with_attentions: return (context_layer, attention_probs) - + return context_layer # B x S_s x C - + class ChannelMLP(nn.Module): def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob, hidden_activation_type='silu'): super().__init__() @@ -218,7 +218,7 @@ def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob, hidden_a self.ffn.add_module('dense2', nn.Linear(in_features=intermediate_size, out_features=hidden_size, bias=True)) self.dropout = nn.Dropout(p=hidden_dropout_prob) - + def forward(self, x): x = self.ffn(x) x = self.dropout(x) @@ -231,20 +231,20 @@ def __init__(self, hidden_size, layer_norm_eps) -> None: self.layernorm_after = nn.LayerNorm(hidden_size) self.token_mixer = nn.Identity() # MultiHeadAttention() self.channel_mlp = nn.Identity() # ChannelMLP() - + def forward(self, x): out_token_mixer = self.layernorm_before(x) out_token_mixer = self.token_mixer(out_token_mixer) - + out_token_mixer = out_token_mixer + x - + out_final = self.layernorm_after(out_token_mixer) out_final = self.channel_mlp(out_final) - + out_final = out_final + out_token_mixer - + return out_final - + class MetaFormerEncoder(nn.Module): def __init__(self) -> None: super().__init__() @@ -252,7 +252,7 @@ def __init__(self) -> None: # self.blocks = nn.Sequential( # *[MetaFormerBlock(hidden_size, layer_norm_eps) for _ in range(num_layers)] # ) - + def forward(self, x): x = self.blocks(x) return x @@ -262,7 +262,7 @@ def __init__(self, hidden_sizes) -> None: super().__init__() self._feature_dim = hidden_sizes[-1] self._intermediate_features_dim = hidden_sizes - + self.patch_embed = nn.Identity() self.encoder = MetaFormerEncoder() self.norm = nn.Identity() @@ -270,14 +270,14 @@ def __init__(self, hidden_sizes) -> None: @property def feature_dim(self): return self._feature_dim - + @property def intermediate_features_dim(self): 
return self._intermediate_features_dim - + def forward(self, x: FXTensorType): x = self.patch_embed(x) x = self.encoder(x) x = self.norm(x) feat = torch.mean(x, dim=1) - return BackboneOutput(last_feature=feat) \ No newline at end of file + return BackboneOutput(last_feature=feat) diff --git a/src/netspresso_trainer/models/op/custom.py b/src/netspresso_trainer/models/op/custom.py index 78715625..7dd1752c 100644 --- a/src/netspresso_trainer/models/op/custom.py +++ b/src/netspresso_trainer/models/op/custom.py @@ -296,9 +296,9 @@ def __init__( # project layers.append( ConvLayer( - in_channels=hidden_channels, - out_channels=out_channels, - kernel_size=1, + in_channels=hidden_channels, + out_channels=out_channels, + kernel_size=1, norm_type=norm_type, use_act=False ) @@ -365,7 +365,7 @@ def __init__( self.patch_dim = patch_dim self.register_buffer("pe", pos_encoding) - + def forward_patch_last( self, x, indices: Optional[Tensor] = None, *args, **kwargs ) -> Tensor: @@ -385,8 +385,8 @@ def forward_others( self, x, indices: Optional[Tensor] = None, *args, **kwargs ) -> Tensor: # seq_length should be the second last dim - - # @deepkyu: [fx tracing] Always `indices` is None + + # @deepkyu: [fx tracing] Always `indices` is None # if indices is None: # x = x + self.pe[..., : x.shape[-2], :] # else: @@ -396,10 +396,10 @@ def forward_others( # pe = self.pe.expand(repeat_size) # selected_pe = torch.gather(pe, index=indices, dim=-2) # x = x + selected_pe - + # x = x + self.pe[..., :seq_index, :] x = x + tensor_slice(self.pe, dim=1, index=x.shape[-2]) - + return x def forward(self, x, indices: Optional[Tensor] = None, *args, **kwargs) -> Tensor: @@ -480,7 +480,7 @@ def forward(self, x: Tensor) -> Tensor: # dims = [-3, -2, -1] # else: # raise NotImplementedError("Currently 2D and 3D global pooling supported") - + return self._global_pool(x, dims=(-2, -1)) # def profile_module(self, input: Tensor) -> Tuple[Tensor, float, float]: @@ -497,9 +497,9 @@ class Focus(nn.Module): def __init__(self, in_channels, out_channels, ksize=1, stride=1, act_type="silu"): super().__init__() self.conv = ConvLayer(in_channels=in_channels * 4, - out_channels=out_channels, - kernel_size=ksize, - stride=stride, + out_channels=out_channels, + kernel_size=ksize, + stride=stride, act_type=act_type) def forward(self, x): @@ -542,25 +542,25 @@ def __init__( # ch_in, ch_out, number, shortcut, groups, expansion super().__init__() hidden_channels = int(out_channels * expansion) # hidden channels - self.conv1 = ConvLayer(in_channels=in_channels, + self.conv1 = ConvLayer(in_channels=in_channels, out_channels=hidden_channels, - kernel_size=1, + kernel_size=1, stride=1, act_type=act_type) self.conv2 = ConvLayer(in_channels=in_channels, - out_channels=hidden_channels, - kernel_size=1, + out_channels=hidden_channels, + kernel_size=1, stride=1, act_type=act_type) - self.conv3 = ConvLayer(in_channels=2 * hidden_channels, - out_channels=out_channels, - kernel_size=1, + self.conv3 = ConvLayer(in_channels=2 * hidden_channels, + out_channels=out_channels, + kernel_size=1, stride=1, act_type=act_type) - + block = DarknetBlock module_list = [ block( - in_channels=hidden_channels, - out_channels=hidden_channels, + in_channels=hidden_channels, + out_channels=hidden_channels, shortcut=shortcut, expansion=1.0, act_type=act_type @@ -585,7 +585,7 @@ def __init__( ): super().__init__() hidden_channels = in_channels // 2 - self.conv1 = ConvLayer(in_channels=in_channels, out_channels=hidden_channels, + self.conv1 = ConvLayer(in_channels=in_channels, 
out_channels=hidden_channels, kernel_size=1, stride=1, act_type=act_type) self.m = nn.ModuleList( [ @@ -594,7 +594,7 @@ def __init__( ] ) conv2_channels = hidden_channels * (len(kernel_sizes) + 1) - self.conv2 = ConvLayer(in_channels=conv2_channels, out_channels=out_channels, + self.conv2 = ConvLayer(in_channels=conv2_channels, out_channels=out_channels, kernel_size=1, stride=1, act_type=act_type) def forward(self, x): @@ -618,9 +618,9 @@ def __init__( ): super().__init__() hidden_channels = int(out_channels * expansion) - self.conv1 = ConvLayer(in_channels=in_channels, out_channels=hidden_channels, + self.conv1 = ConvLayer(in_channels=in_channels, out_channels=hidden_channels, kernel_size=1, stride=1, act_type=act_type) - self.conv2 = ConvLayer(in_channels=hidden_channels, out_channels=out_channels, + self.conv2 = ConvLayer(in_channels=hidden_channels, out_channels=out_channels, kernel_size=3, stride=1, act_type=act_type) self.use_add = shortcut and in_channels == out_channels diff --git a/src/netspresso_trainer/models/op/depth.py b/src/netspresso_trainer/models/op/depth.py index 9c626bb7..276527d3 100644 --- a/src/netspresso_trainer/models/op/depth.py +++ b/src/netspresso_trainer/models/op/depth.py @@ -31,4 +31,4 @@ def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): self.scale_by_keep = scale_by_keep def forward(self, x): - return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) \ No newline at end of file + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) diff --git a/src/netspresso_trainer/models/op/registry.py b/src/netspresso_trainer/models/op/registry.py index 4666e9f5..10c6f8cb 100644 --- a/src/netspresso_trainer/models/op/registry.py +++ b/src/netspresso_trainer/models/op/registry.py @@ -16,4 +16,4 @@ 'silu': nn.SiLU, 'swish': nn.SiLU, 'hard_swish': nn.Hardswish, -} \ No newline at end of file +} diff --git a/src/netspresso_trainer/models/registry.py b/src/netspresso_trainer/models/registry.py index 59ee6cda..999ed517 100644 --- a/src/netspresso_trainer/models/registry.py +++ b/src/netspresso_trainer/models/registry.py @@ -3,20 +3,27 @@ import torch.nn as nn -from .backbones import cspdarknet, efficientformer, mobilenetv3_small, mobilevit, resnet50, segformer, vit +from .backbones import cspdarknet, efficientformer, mixnet, mobilenetv3, mobilevit, resnet, segformer, vit from .full import pidnet from .heads.classification import fc -from .heads.detection import faster_rcnn, yolo_head +from .heads.detection import faster_rcnn, yolox_head from .heads.segmentation import all_mlp_decoder +from .necks import fpn, pafpn MODEL_BACKBONE_DICT: Dict[str, Callable[..., nn.Module]] = { - 'resnet50': resnet50, - 'mobilenetv3_small': mobilenetv3_small, + 'resnet': resnet, + 'mobilenetv3': mobilenetv3, 'segformer': segformer, 'mobilevit': mobilevit, 'vit': vit, 'efficientformer': efficientformer, - 'cspdarknet': cspdarknet + 'cspdarknet': cspdarknet, + 'mixnet': mixnet, +} + +MODEL_NECK_DICT: Dict[str, Callable[..., nn.Module]] = { + 'fpn': fpn, + 'pafpn': pafpn, } MODEL_HEAD_DICT: Dict[str, Callable[..., nn.Module]] = { @@ -28,7 +35,7 @@ }, 'detection': { 'faster_rcnn': faster_rcnn, - 'yolo_head': yolo_head + 'yolox_head': yolox_head }, } diff --git a/src/netspresso_trainer/models/utils.py b/src/netspresso_trainer/models/utils.py index 0307eb6a..aa34c6a5 100644 --- a/src/netspresso_trainer/models/utils.py +++ b/src/netspresso_trainer/models/utils.py @@ -13,6 +13,18 @@ FXTensorType = Union[Tensor, Proxy] FXTensorListType = 
Union[List[Tensor], List[Proxy]] +MODEL_CHECKPOINT_URL_DICT = { + 'resnet50': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/resnet/resnet50.pth", + 'mobilenet_v3_small': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/mobilenetv3/mobilenet_v3_small.pth", + 'segformer': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/segformer/segformer.pth", + 'mobilevit': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/mobilevit/mobilevit_s.pth", + 'vit': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/vit/vit-tiny.pth", + 'efficientformer': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/efficientformer/efficientformer_l1_1000d.pth", + 'mixnet_s': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/mixnet/mixnet_s.pth", + 'mixnet_m': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/mixnet/mixnet_m.pth", + 'mixnet_l': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/mixnet/mixnet_l.pth", +} + class BackboneOutput(TypedDict): intermediate_features: Optional[FXTensorListType] @@ -43,8 +55,29 @@ class PIDNetModelOutput(ModelOutput): extra_d: Optional[FXTensorType] -def load_from_checkpoint(model: nn.Module, model_checkpoint: Optional[Union[str, Path]]) -> nn.Module: +def download_model_checkpoint(model_checkpoint: Union[str, Path], model_name: str) -> Path: + checkpoint_url = MODEL_CHECKPOINT_URL_DICT[model_name] + model_checkpoint = Path(model_checkpoint) + model_checkpoint.parent.mkdir(parents=True, exist_ok=True) + # Safer switch: only extension, user can use the custom name for checkpoint file + model_checkpoint = model_checkpoint.with_suffix(Path(checkpoint_url).suffix) + if not model_checkpoint.exists(): + torch.hub.download_url_to_file(checkpoint_url, model_checkpoint) + + return model_checkpoint + + +def load_from_checkpoint( + model: nn.Module, + model_checkpoint: Optional[Union[str, Path]] +) -> nn.Module: if model_checkpoint is not None: + if not Path(model_checkpoint).exists(): + model_name = Path(model_checkpoint).stem + assert model_name in MODEL_CHECKPOINT_URL_DICT, \ + f"model_name {model_name} in path {model_checkpoint} is not valid name!" 
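Editor's note: the checkpoint auto-download added above resolves a missing local path by using the file stem as the key into `MODEL_CHECKPOINT_URL_DICT` and fetching the file once. A condensed, self-contained sketch of that behaviour follows; the URL is copied from the table above, and the helper name `resolve_checkpoint` is illustrative.

```python
# Minimal sketch of the new checkpoint auto-download behaviour (models/utils.py).
from pathlib import Path

import torch

MODEL_CHECKPOINT_URL_DICT = {
    'resnet50': "https://netspresso-trainer-public.s3.ap-northeast-2.amazonaws.com/checkpoint/resnet/resnet50.pth",
}


def resolve_checkpoint(model_checkpoint: str) -> Path:
    path = Path(model_checkpoint)
    if path.exists():
        return path
    url = MODEL_CHECKPOINT_URL_DICT[path.stem]    # e.g. "resnet50"
    path.parent.mkdir(parents=True, exist_ok=True)
    path = path.with_suffix(Path(url).suffix)     # keep the custom name, swap only the extension
    if not path.exists():
        torch.hub.download_url_to_file(url, str(path))  # fetch once; reused on later runs
    return path

# resolve_checkpoint("./checkpoints/resnet50.pth") downloads the weights on first use.
```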
+ model_checkpoint = download_model_checkpoint(model_checkpoint, model_name) + model_state_dict = torch.load(model_checkpoint, map_location='cpu') missing_keys, unexpected_keys = model.load_state_dict(model_state_dict, strict=False) diff --git a/src/netspresso_trainer/optimizers/__init__.py b/src/netspresso_trainer/optimizers/__init__.py index b1b64e76..871ebc6e 100644 --- a/src/netspresso_trainer/optimizers/__init__.py +++ b/src/netspresso_trainer/optimizers/__init__.py @@ -1 +1 @@ -from .builder import build_optimizer \ No newline at end of file +from .builder import build_optimizer diff --git a/src/netspresso_trainer/optimizers/builder.py b/src/netspresso_trainer/optimizers/builder.py index 3d2d2bf6..460af3fe 100644 --- a/src/netspresso_trainer/optimizers/builder.py +++ b/src/netspresso_trainer/optimizers/builder.py @@ -21,7 +21,7 @@ def build_optimizer( 'adam', 'adamw', 'adamax', 'adadelta', 'adagrad', 'rmsprop'] = opt.lower() assert opt_name in OPTIMIZER_DICT - + conf_optim = {'weight_decay': wd, 'lr': lr} if opt_name in ['sgd', 'nesterov', 'momentum', 'rmsprop']: @@ -32,7 +32,7 @@ def build_optimizer( conf_optim.update({'nesterov': True}) if opt_name in ['momentum']: conf_optim.update({'nesterov': False}) - + optimizer = OPTIMIZER_DICT[opt_name](parameters, **conf_optim) return optimizer diff --git a/src/netspresso_trainer/optimizers/registry.py b/src/netspresso_trainer/optimizers/registry.py index f4c3fc22..4f1ff759 100644 --- a/src/netspresso_trainer/optimizers/registry.py +++ b/src/netspresso_trainer/optimizers/registry.py @@ -14,4 +14,4 @@ 'sgd': optim.SGD, 'nesterov': optim.SGD, 'momentum': optim.SGD, -} \ No newline at end of file +} diff --git a/src/netspresso_trainer/pipelines/base.py b/src/netspresso_trainer/pipelines/base.py index f43fb2dd..58dd7aa6 100644 --- a/src/netspresso_trainer/pipelines/base.py +++ b/src/netspresso_trainer/pipelines/base.py @@ -15,6 +15,7 @@ from ..losses import build_losses from ..metrics import build_metrics from ..optimizers import build_optimizer +from ..postprocessors import build_postprocessor from ..schedulers import build_scheduler from ..utils.fx import save_graphmodule from ..utils.logger import yaml_for_logging @@ -87,6 +88,7 @@ def set_train(self): self.scheduler, _ = build_scheduler(self.optimizer, self.conf.training) self.loss_factory = build_losses(self.conf.model, ignore_index=self.ignore_index) self.metric_factory = build_metrics(self.task, self.conf.model, ignore_index=self.ignore_index, num_classes=self.num_classes) + self.postprocessor = build_postprocessor(self.task, self.conf.model) resume_optimizer_checkpoint = self.conf.model.resume_optimizer_checkpoint if resume_optimizer_checkpoint is not None: resume_optimizer_checkpoint = Path(resume_optimizer_checkpoint) diff --git a/src/netspresso_trainer/pipelines/builder.py b/src/netspresso_trainer/pipelines/builder.py index 4773261c..18698855 100644 --- a/src/netspresso_trainer/pipelines/builder.py +++ b/src/netspresso_trainer/pipelines/builder.py @@ -9,9 +9,9 @@ def build_pipeline(conf, task, model_name, model, devices, train_dataloader, eva task_ = "detection-two-stage" if conf.model.architecture.head.name in ["faster_rcnn"] else "detection-one-stage" task_pipeline = TASK_PIPELINE[task_] - + trainer = task_pipeline(conf, task, model_name, model, devices, train_dataloader, eval_dataloader, class_map, is_graphmodule_training=is_graphmodule_training, profile=profile) - return trainer \ No newline at end of file + return trainer diff --git 
a/src/netspresso_trainer/pipelines/classification.py b/src/netspresso_trainer/pipelines/classification.py index 709da58b..53c48fc5 100644 --- a/src/netspresso_trainer/pipelines/classification.py +++ b/src/netspresso_trainer/pipelines/classification.py @@ -28,7 +28,10 @@ def train_step(self, batch): out = self.model(images) self.loss_factory.calc(out, target, phase='train') - self.metric_factory.calc(out['pred'], target, phase='train') + if target.dim() > 1: # Soft label to label number + target = torch.argmax(target, dim=-1) + pred = self.postprocessor(out) + self.metric_factory.calc(pred, target, phase='train') self.loss_factory.backward() self.optimizer.step() @@ -44,7 +47,10 @@ def valid_step(self, batch): out = self.model(images) self.loss_factory.calc(out, target, phase='valid') - self.metric_factory.calc(out['pred'], target, phase='valid') + if target.dim() > 1: # Soft label to label number + target = torch.argmax(target, dim=-1) + pred = self.postprocessor(out) + self.metric_factory.calc(pred, target, phase='valid') if self.conf.distributed: torch.distributed.barrier() @@ -55,7 +61,7 @@ def test_step(self, batch): images = images.to(self.devices) out = self.model(images.unsqueeze(0)) - _, pred = out['pred'].topk(1, 1, True, True) + pred = self.postprocessor(out, k=1) if self.conf.distributed: torch.distributed.barrier() diff --git a/src/netspresso_trainer/pipelines/detection.py b/src/netspresso_trainer/pipelines/detection.py index 3d7e1ab5..b3897fc8 100644 --- a/src/netspresso_trainer/pipelines/detection.py +++ b/src/netspresso_trainer/pipelines/detection.py @@ -33,6 +33,15 @@ def __init__(self, conf, task, model_name, model, devices, train_dataloader, eva model = model.to(device=devices) self.model = model + if conf.distributed: + self.backbone_to_train = model.module.backbone + self.neck_to_train = model.module.neck + self.head_to_train = model.module.head + else: + self.backbone_to_train = model.backbone + self.neck = model.neck + self.head_to_train = model.head + def train_step(self, batch): self.model.train() images, labels, bboxes = batch['pixel_values'], batch['label'], batch['bbox'] @@ -43,12 +52,12 @@ def train_step(self, batch): self.optimizer.zero_grad() # forward to rpn - backbone = self.model.backbone - head = self.model.head + backbone = self.backbone_to_train + neck = self.neck_to_train + head = self.head_to_train features = backbone(images)['intermediate_features'] - if head.neck: - features = head.neck(features) + features = neck(features)['intermediate_features'] features = {str(k): v for k, v in enumerate(features)} rpn_features = head.rpn(features, head.image_size) @@ -86,7 +95,7 @@ def valid_step(self, batch): out = self.model(images) # Compute loss - head = self.model.head + head = self.head_to_train matched_idxs, roi_head_labels = head.roi_heads.assign_targets_to_proposals(out['boxes'], bboxes, labels) matched_gt_boxes = [bbox[idx] for idx, bbox in zip(matched_idxs, bboxes)] regression_targets = head.roi_heads.box_coder.encode(matched_gt_boxes, out['boxes']) @@ -139,7 +148,7 @@ def get_metric_with_all_outputs(self, outputs, phase: Literal['train', 'valid']) pred_on_image['post_labels'] = class_idx pred.append(pred_on_image) self.metric_factory.calc(pred, target=targets, phase=phase) - + def save_checkpoint(self, epoch: int): # Check whether the valid loss is minimum at this epoch @@ -211,9 +220,9 @@ def train_step(self, batch): images = images.to(self.devices) targets = [{"boxes": box.to(self.devices), "labels": label.to(self.devices),} for box, label in 
zip(bboxes, labels)] - - targets = {'gt': targets, - 'img_size': images.size(-1), + + targets = {'gt': targets, + 'img_size': images.size(-1), 'num_classes': self.num_classes,} self.optimizer.zero_grad() @@ -224,9 +233,7 @@ def train_step(self, batch): self.loss_factory.backward() self.optimizer.step() - # TODO: This step will be moved to postprocessor module - pred = self.decode_outputs(out, dtype=out[0].type(), stage_strides=[images.shape[-1] // o.shape[-1] for o in out]) - pred = self.postprocess(pred, self.num_classes) + pred = self.postprocessor(out, original_shape=images[0].shape, num_classes=self.num_classes) if self.conf.distributed: torch.distributed.barrier() @@ -235,7 +242,7 @@ def train_step(self, batch): 'target': [(bbox.detach().cpu().numpy(), label.detach().cpu().numpy()) for bbox, label in zip(bboxes, labels)], 'pred': [(torch.cat([p[:, :4], p[:, 5:6]], dim=-1).detach().cpu().numpy(), - p[:, 6].to(torch.int).detach().cpu().numpy()) + p[:, 6].to(torch.int).detach().cpu().numpy()) if p is not None else (np.array([[]]), np.array([])) for p in pred] } @@ -247,9 +254,9 @@ def valid_step(self, batch): images = images.to(self.devices) targets = [{"boxes": box.to(self.devices), "labels": label.to(self.devices)} for box, label in zip(bboxes, labels)] - - targets = {'gt': targets, - 'img_size': images.size(-1), + + targets = {'gt': targets, + 'img_size': images.size(-1), 'num_classes': self.num_classes,} self.optimizer.zero_grad() @@ -257,9 +264,7 @@ def valid_step(self, batch): out = self.model(images) self.loss_factory.calc(out, targets, phase='valid') - # TODO: This step will be moved to postprocessor module - pred = self.decode_outputs(out, dtype=out[0].type(), stage_strides=[images.shape[-1] // o.shape[-1] for o in out]) - pred = self.postprocess(pred, self.num_classes) + pred = self.postprocessor(out, original_shape=images[0].shape, num_classes=self.num_classes) if self.conf.distributed: torch.distributed.barrier() @@ -269,7 +274,7 @@ def valid_step(self, batch): 'target': [(bbox.detach().cpu().numpy(), label.detach().cpu().numpy()) for bbox, label in zip(bboxes, labels)], 'pred': [(torch.cat([p[:, :4], p[:, 5:6]], dim=-1).detach().cpu().numpy(), - p[:, 6].to(torch.int).detach().cpu().numpy()) + p[:, 6].to(torch.int).detach().cpu().numpy()) if p is not None else (np.array([[]]), np.array([])) for p in pred] } @@ -282,9 +287,7 @@ def test_step(self, batch): out = self.model(images.unsqueeze(0)) - # TODO: This step will be moved to postprocessor module - pred = self.decode_outputs(out, dtype=out[0].type(), stage_strides=[images.shape[-1] // o.shape[-1] for o in out]) - pred = self.postprocess(pred, self.num_classes) + pred = self.postprocessor(out, original_shape=images[0].shape, num_classes=self.num_classes) results = [(p[:, :4].detach().cpu().numpy(), p[:, 6].to(torch.int).detach().cpu().numpy()) if p is not None else (np.array([[]]), np.array([])) @@ -309,73 +312,3 @@ def get_metric_with_all_outputs(self, outputs, phase: Literal['train', 'valid']) pred_on_image['post_labels'] = class_idx pred.append(pred_on_image) self.metric_factory.calc(pred, target=targets, phase=phase) - - # TODO: Temporary defined in pipeline, it will be moved to postprocessor module. 
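Editor's note: the YOLOX decode and NMS steps that previously lived in this pipeline (removed just below) are now reached through the postprocessor module added later in this diff. A minimal sketch of that hand-off, assuming the illustrative detection config shown here; `build_postprocessor` and the call signature follow the postprocessors/ hunks further down.

```python
# Illustrative config only; the pipeline itself no longer decodes raw head outputs.
from omegaconf import OmegaConf

from netspresso_trainer.postprocessors import build_postprocessor  # added in this PR

conf_model = OmegaConf.create({
    "single_task_model": False,
    "architecture": {"head": {"name": "yolox_head"}},
})
postprocessor = build_postprocessor("detection", conf_model)

# Inside the one-stage pipeline, the old decode_outputs/postprocess pair becomes:
# pred = postprocessor(out, original_shape=images[0].shape, num_classes=num_classes)
```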
- def decode_outputs(self, outputs, dtype, stage_strides): - hw = [x.shape[-2:] for x in outputs] - # [batch, n_anchors_all, num_classes + 5] - outputs = torch.cat([x.flatten(start_dim=2) for x in outputs], dim=2).permute(0, 2, 1) - outputs[..., 4:] = outputs[..., 4:].sigmoid() - - grids = [] - strides = [] - for (hsize, wsize), stride in zip(hw, stage_strides): - yv, xv = torch.meshgrid(torch.arange(hsize), torch.arange(wsize), indexing='ij') - grid = torch.stack((xv, yv), 2).view(1, -1, 2) - grids.append(grid) - shape = grid.shape[:2] - strides.append(torch.full((*shape, 1), stride)) - - grids = torch.cat(grids, dim=1).type(dtype) - strides = torch.cat(strides, dim=1).type(dtype) - - outputs = torch.cat([ - (outputs[..., 0:2] + grids) * strides, - torch.exp(outputs[..., 2:4]) * strides, - outputs[..., 4:] - ], dim=-1) - return outputs - - # TODO: Temporary defined in pipeline, it will be moved to postprocessor module. - def postprocess(self, prediction, num_classes, conf_thre=0.7, nms_thre=0.45, class_agnostic=False): - box_corner = prediction.new(prediction.shape) - box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 - box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 - box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 - box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 - prediction[:, :, :4] = box_corner[:, :, :4] - - output = [torch.zeros(0, 7).to(prediction.device) for i in range(len(prediction))] - for i, image_pred in enumerate(prediction): - - # If none are remaining => process next image - if not image_pred.size(0): - continue - # Get score and class with highest confidence - class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True) - - conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() - # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) - detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1) - detections = detections[conf_mask] - if not detections.size(0): - continue - - if class_agnostic: - nms_out_index = torchvision.ops.nms( - detections[:, :4], - detections[:, 4] * detections[:, 5], - nms_thre, - ) - else: - nms_out_index = torchvision.ops.batched_nms( - detections[:, :4], - detections[:, 4] * detections[:, 5], - detections[:, 6], - nms_thre, - ) - - detections = detections[nms_out_index] - output[i] = torch.cat((output[i], detections)) - - return output diff --git a/src/netspresso_trainer/pipelines/registry.py b/src/netspresso_trainer/pipelines/registry.py index 61fb0dd2..b0110bee 100644 --- a/src/netspresso_trainer/pipelines/registry.py +++ b/src/netspresso_trainer/pipelines/registry.py @@ -13,4 +13,4 @@ 'segmentation': SegmentationPipeline, 'detection-two-stage': TwoStageDetectionPipeline, 'detection-one-stage': OneStageDetectionPipeline, -} \ No newline at end of file +} diff --git a/src/netspresso_trainer/pipelines/segmentation.py b/src/netspresso_trainer/pipelines/segmentation.py index 1da5af67..782d4952 100644 --- a/src/netspresso_trainer/pipelines/segmentation.py +++ b/src/netspresso_trainer/pipelines/segmentation.py @@ -41,7 +41,8 @@ def train_step(self, batch): self.optimizer.step() out = {k: v.detach() for k, v in out.items()} - self.metric_factory.calc(out['pred'], target, phase='train') + pred = self.postprocessor(out) + self.metric_factory.calc(pred, target, phase='train') if self.conf.distributed: torch.distributed.barrier() @@ -62,7 +63,8 @@ def valid_step(self, batch): else: 
self.loss_factory.calc(out, target, phase='valid') - self.metric_factory.calc(out['pred'], target, phase='valid') + pred = self.postprocessor(out) + self.metric_factory.calc(pred, target, phase='valid') if self.conf.distributed: torch.distributed.barrier() @@ -87,9 +89,9 @@ def test_step(self, batch): out = self.model(images.unsqueeze(0)) - output_seg = torch.max(out['pred'], dim=1)[1] # argmax + pred = self.postprocessor(out) - return output_seg + return pred def get_metric_with_all_outputs(self, outputs, phase: Literal['train', 'valid']): pass diff --git a/src/netspresso_trainer/postprocessors/__init__.py b/src/netspresso_trainer/postprocessors/__init__.py new file mode 100644 index 00000000..5fddd44e --- /dev/null +++ b/src/netspresso_trainer/postprocessors/__init__.py @@ -0,0 +1 @@ +from .builder import build_postprocessor diff --git a/src/netspresso_trainer/postprocessors/builder.py b/src/netspresso_trainer/postprocessors/builder.py new file mode 100644 index 00000000..abb93ffe --- /dev/null +++ b/src/netspresso_trainer/postprocessors/builder.py @@ -0,0 +1,8 @@ +from .register import POSTPROCESSOR_DICT + + +def build_postprocessor(task: str, conf_model): + head_name = conf_model.architecture.full.name if conf_model.single_task_model else conf_model.architecture.head.name + if head_name not in POSTPROCESSOR_DICT: + return None + return POSTPROCESSOR_DICT[head_name](conf_model) diff --git a/src/netspresso_trainer/postprocessors/classification.py b/src/netspresso_trainer/postprocessors/classification.py new file mode 100644 index 00000000..37969325 --- /dev/null +++ b/src/netspresso_trainer/postprocessors/classification.py @@ -0,0 +1,18 @@ +from typing import Optional + +from ..models.utils import ModelOutput + +TOPK_MAX = 20 + + +class ClassificationPostprocessor(): + def __init__(self, conf_model): + pass + + def __call__(self, outputs: ModelOutput, k: Optional[int]=None): + pred = outputs['pred'] + maxk = min(TOPK_MAX, pred.size()[1]) + if k: + maxk = min(k, maxk) + _, pred = pred.topk(maxk, 1, True, True) + return pred diff --git a/src/netspresso_trainer/postprocessors/detection.py b/src/netspresso_trainer/postprocessors/detection.py new file mode 100644 index 00000000..8e710ac7 --- /dev/null +++ b/src/netspresso_trainer/postprocessors/detection.py @@ -0,0 +1,96 @@ +import torch +import torchvision + +from ..models.utils import ModelOutput + + +def yolox_decode_outputs(pred, original_shape): + dtype = pred[0].type() + stage_strides= [original_shape[-1] // o.shape[-1] for o in pred] + + hw = [x.shape[-2:] for x in pred] + # [batch, n_anchors_all, num_classes + 5] + pred = torch.cat([x.flatten(start_dim=2) for x in pred], dim=2).permute(0, 2, 1) + pred[..., 4:] = pred[..., 4:].sigmoid() + + grids = [] + strides = [] + for (hsize, wsize), stride in zip(hw, stage_strides): + yv, xv = torch.meshgrid(torch.arange(hsize), torch.arange(wsize), indexing='ij') + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = torch.cat(grids, dim=1).type(dtype) + strides = torch.cat(strides, dim=1).type(dtype) + + pred = torch.cat([ + (pred[..., 0:2] + grids) * strides, + torch.clamp(torch.exp(pred[..., 2:4]) * strides, min=torch.iinfo(torch.int32).min, max=torch.iinfo(torch.int32).max), + pred[..., 4:] + ], dim=-1) + + box_corner = pred.new(pred.shape) + box_corner[:, :, 0] = pred[:, :, 0] - pred[:, :, 2] / 2 + box_corner[:, :, 1] = pred[:, :, 1] - pred[:, :, 3] / 2 + box_corner[:, :, 2] = pred[:, :, 0] 
+ pred[:, :, 2] / 2 + box_corner[:, :, 3] = pred[:, :, 1] + pred[:, :, 3] / 2 + pred[:, :, :4] = box_corner[:, :, :4] + return pred + + +def nms(prediction, num_classes, conf_thre=0.7, nms_thre=0.45, class_agnostic=False): + output = [torch.zeros(0, 7).to(prediction.device) for i in range(len(prediction))] + for i, image_pred in enumerate(prediction): + + # If none are remaining => process next image + if not image_pred.size(0): + continue + # Get score and class with highest confidence + class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True) + + conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() + # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) + detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1) + detections = detections[conf_mask] + if not detections.size(0): + continue + + if class_agnostic: + nms_out_index = torchvision.ops.nms( + detections[:, :4], + detections[:, 4] * detections[:, 5], + nms_thre, + ) + else: + nms_out_index = torchvision.ops.batched_nms( + detections[:, :4], + detections[:, 4] * detections[:, 5], + detections[:, 6], + nms_thre, + ) + + detections = detections[nms_out_index] + output[i] = torch.cat((output[i], detections)) + + return output + + +class DetectionPostprocessor: + def __init__(self, conf_model): + HEAD_POSTPROCESS_MAPPING = { + 'yolox_head': [yolox_decode_outputs, nms] + } + + head_name = conf_model.architecture.head.name + self.decode_outputs, self.postprocess = HEAD_POSTPROCESS_MAPPING[head_name] + + def __call__(self, outputs: ModelOutput, original_shape, num_classes, conf_thresh=0.7, nms_thre=0.45, class_agnostic=False): + pred = outputs['pred'] + + if self.decode_outputs: + pred = self.decode_outputs(pred, original_shape) + if self.postprocess: + pred = self.postprocess(pred, num_classes=num_classes, conf_thre=conf_thresh, nms_thre=nms_thre, class_agnostic=class_agnostic) + return pred diff --git a/src/netspresso_trainer/postprocessors/register.py b/src/netspresso_trainer/postprocessors/register.py new file mode 100644 index 00000000..fc137c79 --- /dev/null +++ b/src/netspresso_trainer/postprocessors/register.py @@ -0,0 +1,12 @@ +from typing import Dict, Type + +from .classification import ClassificationPostprocessor +from .detection import DetectionPostprocessor +from .segmentation import SegmentationPostprocessor + +POSTPROCESSOR_DICT = { + 'fc': ClassificationPostprocessor, + 'all_mlp_decoder': SegmentationPostprocessor, + 'yolox_head': DetectionPostprocessor, + 'pidnet': SegmentationPostprocessor, +} diff --git a/src/netspresso_trainer/postprocessors/segmentation.py b/src/netspresso_trainer/postprocessors/segmentation.py new file mode 100644 index 00000000..f40334c1 --- /dev/null +++ b/src/netspresso_trainer/postprocessors/segmentation.py @@ -0,0 +1,15 @@ +from typing import Any, Optional + +import torch + +from ..models.utils import ModelOutput + + +class SegmentationPostprocessor: + def __init__(self, conf_model): + pass + + def __call__(self, outputs: ModelOutput): + pred = outputs['pred'] + pred = torch.max(pred, dim=1)[1] # argmax + return pred diff --git a/src/netspresso_trainer/schedulers/builder.py b/src/netspresso_trainer/schedulers/builder.py index ce566554..df4b49f3 100644 --- a/src/netspresso_trainer/schedulers/builder.py +++ b/src/netspresso_trainer/schedulers/builder.py @@ -15,8 +15,8 @@ def build_scheduler(optimizer, conf_training): 'total_iters': num_epochs, 'iters_per_phase': conf_training.iters_per_phase, # TODO: 
config for StepLR
     })
-
+
     assert scheduler_name in SCHEDULER_DICT, f"{scheduler_name} not in scheduler dict!"
     lr_scheduler = SCHEDULER_DICT[scheduler_name](optimizer, **conf_sched)
-
+
     return lr_scheduler, num_epochs
diff --git a/src/netspresso_trainer/schedulers/cosine_lr.py b/src/netspresso_trainer/schedulers/cosine_lr.py
index aac4d301..e24286a4 100644
--- a/src/netspresso_trainer/schedulers/cosine_lr.py
+++ b/src/netspresso_trainer/schedulers/cosine_lr.py
@@ -31,10 +31,10 @@ def get_lr(self):
         if not self._get_lr_called_within_step:
             warnings.warn("To get the last learning rate computed by the scheduler, "
                           "please use `get_last_lr()`.", UserWarning, stacklevel=2)
-
+
         if self.last_epoch > self.T_max:
             return [group['lr'] for group in self.optimizer.param_groups]
-
+
         if self.last_epoch >= 0 and self.last_epoch < self.warmup_iters:
             return [self.warmup_bias_lr + (float(self.last_epoch + 1) / float(max(1, self.warmup_iters))) * (base_lr - self.warmup_bias_lr)
                     for base_lr in self.base_lrs]
@@ -63,4 +63,4 @@ def _get_closed_form_lr(self):
                 )
             )
             for base_lr in self.base_lrs
-        ]
\ No newline at end of file
+        ]
diff --git a/src/netspresso_trainer/schedulers/cosine_warm_restart.py b/src/netspresso_trainer/schedulers/cosine_warm_restart.py
index 4e114142..ba0f501f 100644
--- a/src/netspresso_trainer/schedulers/cosine_warm_restart.py
+++ b/src/netspresso_trainer/schedulers/cosine_warm_restart.py
@@ -91,7 +91,7 @@ def get_reassigned_t_i(current_t_i, next_t_i, remain_epochs):
                 return remain_epochs, remain_epochs
             return current_t_i, remain_epochs
-
+

     def _step_without_given_epoch(self) -> int:
         if self.last_epoch < 0:
             epoch = 0
@@ -105,7 +105,7 @@ def _step_without_given_epoch(self) -> int:
                 self.T_i = self.T_i * self.T_mult
                 self.T_i, self.remain_iters = self.get_reassigned_t_i(self.T_i, self.T_i * self.T_mult, self.remain_iters)
         return epoch
-
+

     def step(self, epoch=None):
         """Step could be called after every batch update
@@ -139,7 +139,7 @@ def step(self, epoch=None):
         else:
             if epoch < 0:
                 raise ValueError("Expected non-negative epoch, but got {}".format(epoch))
-
+
             if epoch >= self.T_0:
                 if self.T_mult == 1:
                     self.T_cur = epoch % self.T_0
diff --git a/src/netspresso_trainer/schedulers/poly_lr.py b/src/netspresso_trainer/schedulers/poly_lr.py
index 9c9b21c2..d7f62461 100644
--- a/src/netspresso_trainer/schedulers/poly_lr.py
+++ b/src/netspresso_trainer/schedulers/poly_lr.py
@@ -34,7 +34,7 @@ def get_lr(self):
         if self.last_epoch > self.total_iters:
             return [group["lr"] for group in self.optimizer.param_groups]
-
+
         if self.last_epoch >= 0 and self.last_epoch < self.warmup_iters:
             return [self.warmup_bias_lr + (float(self.last_epoch + 1) / float(max(1, self.warmup_iters))) * (base_lr - self.warmup_bias_lr)
                     for base_lr in self.base_lrs]
@@ -45,7 +45,7 @@ def get_lr(self):
             return [self.min_lr + (group["lr"] - self.min_lr) * decay_factor for group in self.optimizer.param_groups]

     def _get_closed_form_lr(self):
-        decay_steps = self.total_iters - self.warmup_iters 
+        decay_steps = self.total_iters - self.warmup_iters
         return [
             (
                 min(
@@ -54,4 +54,4 @@ def _get_closed_form_lr(self):
                 )
             )
             for base_lr in self.base_lrs
-        ]
\ No newline at end of file
+        ]
diff --git a/src/netspresso_trainer/schedulers/registry.py b/src/netspresso_trainer/schedulers/registry.py
index d389ae0d..992a9d3c 100644
--- a/src/netspresso_trainer/schedulers/registry.py
+++ b/src/netspresso_trainer/schedulers/registry.py
@@ -13,4 +13,4 @@
     'cosine_no_sgdr': CosineAnnealingLRWithCustomWarmUp,
     'poly': PolynomialLRWithWarmUp,
     'step': StepLR
-}
\ No newline at end of file
+}
diff --git a/src/netspresso_trainer/schedulers/step_lr.py b/src/netspresso_trainer/schedulers/step_lr.py
index 26776e7c..ab97636f 100644
--- a/src/netspresso_trainer/schedulers/step_lr.py
+++ b/src/netspresso_trainer/schedulers/step_lr.py
@@ -52,4 +52,4 @@ def get_lr(self):

     def _get_closed_form_lr(self):
         return [base_lr * self.gamma ** (self.last_epoch // self.step_size)
-                for base_lr in self.base_lrs]
\ No newline at end of file
+                for base_lr in self.base_lrs]
diff --git a/src/netspresso_trainer/trainer_cli.py b/src/netspresso_trainer/trainer_cli.py
index 56cde7b8..e0fa0c64 100644
--- a/src/netspresso_trainer/trainer_cli.py
+++ b/src/netspresso_trainer/trainer_cli.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 from typing import Union

+import torch
 from omegaconf import DictConfig, OmegaConf

 from netspresso_trainer.trainer_common import train_common
@@ -12,17 +13,17 @@
 def run_distributed_training_script(gpu_ids, data, augmentation, model, training, logging, environment, log_level):
-
+
     command = [
         "--data", data,
-        "--augmentation", augmentation, 
+        "--augmentation", augmentation,
         "--model", model,
         "--training", training,
         "--logging", logging,
         "--environment", environment,
         "--log_level", log_level,
     ]
-
+
     # Distributed training script
     command = [
         'python', '-m', 'torch.distributed.launch',
@@ -45,10 +46,10 @@ def parse_gpu_ids(gpu_arg: str):
     """Parse comma-separated GPU IDs and return as a list of integers."""
     try:
         gpu_ids = [int(id) for id in gpu_arg.split(',')]
-
+
         if len(gpu_ids) == 1:  # Single GPU
             return gpu_ids[0]
-
+
         gpu_ids = sorted(gpu_ids)
         return gpu_ids
     except ValueError as e:
@@ -60,7 +61,7 @@ def parse_args_netspresso(with_gpus=False):
     parser = argparse.ArgumentParser(description="Parser for NetsPresso configuration")

     # -------- User arguments ----------------------------------------
-
+
     if with_gpus:
         parser.add_argument(
             '--gpus', type=parse_gpu_ids, default=0,
@@ -110,7 +111,7 @@ def parse_args_netspresso(with_gpus=False):
 def set_arguments(data: Union[Path, str], augmentation: Union[Path, str],
                   model: Union[Path, str], training: Union[Path, str],
                   logging: Union[Path, str], environment: Union[Path, str]) -> DictConfig:
-
+
     conf_data = OmegaConf.load(data)
     conf_augmentation = OmegaConf.load(augmentation)
     conf_model = OmegaConf.load(model)
@@ -125,18 +126,19 @@ def set_arguments(data: Union[Path, str], augmentation: Union[Path, str],
     conf.merge_with(conf_training)
     conf.merge_with(conf_logging)
     conf.merge_with(conf_environment)
-
+
     return conf


 def train_with_yaml_impl(gpus: Union[list, int], data: Union[Path, str], augmentation: Union[Path, str],
                          model: Union[Path, str], training: Union[Path, str],
                          logging: Union[Path, str], environment: Union[Path, str], log_level: str = LOG_LEVEL):
-
+
     assert isinstance(gpus, (list, int))
     gpu_ids_str = ','.join(map(str, gpus)) if isinstance(gpus, list) else str(gpus)
     os.environ['CUDA_VISIBLE_DEVICES'] = gpu_ids_str
-
+    torch.cuda.empty_cache() # Reinitialize CUDA to apply the change
+
     if isinstance(gpus, int):
         conf = set_arguments(data, augmentation, model, training, logging, environment)
         train_common(conf, log_level=log_level)
@@ -146,7 +148,7 @@ def train_cli() -> None:
     args_parsed = parse_args_netspresso(with_gpus=True)
-
+
     train_with_yaml_impl(
         gpus=args_parsed.gpus,
         data=args_parsed.data,
@@ -161,7 +163,7 @@ def train_cli() -> None:

 def train_cli_without_additional_gpu_check() -> None:
     args_parsed = parse_args_netspresso(with_gpus=False)
-
+
     conf = set_arguments(
         data=args_parsed.data,
         augmentation=args_parsed.augmentation,
@@ -175,6 +177,6 @@ def train_cli_without_additional_gpu_check() -> None:

 if __name__ == "__main__":
-
+
     # Execute by `run_distributed_training_script`
-    train_cli_without_additional_gpu_check()
\ No newline at end of file
+    train_cli_without_additional_gpu_check()
diff --git a/src/netspresso_trainer/trainer_common.py b/src/netspresso_trainer/trainer_common.py
index daa45300..6d76b36a 100644
--- a/src/netspresso_trainer/trainer_common.py
+++ b/src/netspresso_trainer/trainer_common.py
@@ -29,10 +29,9 @@ def train_common(conf: DictConfig, log_level: Literal['DEBUG', 'INFO', 'WARNING'
     # TODO: Get model name from checkpoint
     single_task_model = is_single_task_model(conf.model)
-    conf_model_sub = conf.model.architecture.full if single_task_model else conf.model.architecture.backbone
     conf.model.single_task_model = single_task_model

-    model_name = str(conf_model_sub.name).lower()
+    model_name = str(conf.model.name).lower()

     if is_graphmodule_training:
         model_name += "_graphmodule"
diff --git a/src/netspresso_trainer/trainer_inline.py b/src/netspresso_trainer/trainer_inline.py
index 556194df..d291bce4 100644
--- a/src/netspresso_trainer/trainer_inline.py
+++ b/src/netspresso_trainer/trainer_inline.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import List, Literal, Union

+import torch
 from omegaconf import DictConfig, OmegaConf

 from netspresso_trainer.cfg import TrainerConfig
@@ -12,7 +13,7 @@
 def set_struct_recursive(conf: DictConfig, value: bool) -> None:
     OmegaConf.set_struct(conf, value)
-
+
     for _, conf_value in conf.items():
         if isinstance(conf_value, DictConfig):
             set_struct_recursive(conf_value, value)
@@ -23,18 +24,26 @@ def export_config_as_yaml(config: TrainerConfig) -> str:
     return OmegaConf.to_yaml(conf)


-def train_with_config(config: TrainerConfig, log_level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] = 'INFO') -> None:
+def train_with_config(gpus: str, config: TrainerConfig, log_level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] = 'INFO') -> None:
+
+    gpus: Union[List, int] = parse_gpu_ids(gpus)
+    assert isinstance(gpus, int), f"Currently, only single-GPU training is supported in this API. Your gpu(s): {gpus}"
+
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpus)
+    torch.cuda.empty_cache() # Reinitialize CUDA to apply the change
+
     conf: DictConfig = OmegaConf.create(config)
     set_struct_recursive(conf, False)
+
     train_common(conf, log_level=log_level)


 def train_with_yaml(gpus: str, data: Union[Path, str], augmentation: Union[Path, str], model: Union[Path, str],
                     training: Union[Path, str], logging: Union[Path, str], environment: Union[Path, str],
                     log_level: str = LOG_LEVEL):
-
+
     gpus: Union[List, int] = parse_gpu_ids(gpus)
-
+
     train_with_yaml_impl(
         gpus=gpus,
         data=data,
@@ -44,4 +53,4 @@ def train_with_yaml(gpus: str, data: Union[Path, str], augmentation: Union[Path,
         logging=logging,
         environment=environment,
         log_level=log_level
-    )
\ No newline at end of file
+    )
diff --git a/src/netspresso_trainer/utils/logger.py b/src/netspresso_trainer/utils/logger.py
index bf296de1..6b320a09 100644
--- a/src/netspresso_trainer/utils/logger.py
+++ b/src/netspresso_trainer/utils/logger.py
@@ -24,7 +24,7 @@ def _custom_logger(name: str, level: str, distributed: bool):
     else:
         fmt = '%(asctime)s | %(levelname)s\t\t| %(funcName)s:<%(filename)s>:%(lineno)s >>> %(message)s'
     logger = logging.getLogger(name)
-
+

     if not logger.hasHandlers():
         handler = logging.StreamHandler()
@@ -46,7 +46,7 @@ def set_logger(logger_name="netspresso_trainer", level: str = 'INFO', distribute
         print("Skipping timezone setting.")
     _level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] = level.upper()
     _custom_logger(logger_name, _level, distributed)
-
+
     logger = logging.getLogger(logger_name)
     if _level == 'DEBUG':
         logger.setLevel(logging.DEBUG)
diff --git a/src/netspresso_trainer/utils/stats.py b/src/netspresso_trainer/utils/stats.py
index aacc385d..326721d2 100644
--- a/src/netspresso_trainer/utils/stats.py
+++ b/src/netspresso_trainer/utils/stats.py
@@ -12,7 +12,7 @@ def get_params_and_macs(model: nn.Module, sample_input: torch.Tensor):
     sample_input = sample_input.to(get_device(model))

     # From v0.0.9
     macs, params = _params_and_macs_fvcore(model, sample_input)
-
+
     # # Before v0.0.9
     # macs, params = _params_and_macs_thop(model, sample_input)
@@ -25,4 +25,4 @@ def _params_and_macs_fvcore(model: nn.Module, sample_input: torch.Tensor):

 def _params_and_macs_thop(model: nn.Module, sample_input: torch.Tensor):
     macs, params = thop.profile(model, inputs=(sample_input,), verbose=False)
-    return macs, params
\ No newline at end of file
+    return macs, params
diff --git a/tools/config_test.py b/tools/config_test.py
index ea1c451d..81f7bc5d 100644
--- a/tools/config_test.py
+++ b/tools/config_test.py
@@ -8,13 +8,16 @@ if __name__ == "__main__":
     from netspresso_trainer.cfg import (
+        AugmentationConfig,
         ClassificationAugmentationConfig,
         ClassificationResNetModelConfig,
         ColorJitter,
+        RandomResizedCrop,
+        RandomHorizontalFlip,
         ExampleBeansDataset,
     )

-    augmentation_config = ClassificationAugmentationConfig(color_jitter=ColorJitter(colorjitter_p=0.9))
+    augmentation_config = ClassificationAugmentationConfig()
     example_dataset = ExampleBeansDataset
     example_model = ClassificationResNetModelConfig()
     cfg = TrainerConfig(
@@ -32,12 +35,12 @@
     # OK: update value of subclass in the main dataclass
     cfg_new: TrainerConfig = deepcopy(cfg)
-    cfg_new.augmentation.color_jitter.saturation = 0.0
+    cfg_new.augmentation.transforms[-1].saturation = 0.0
     # print(OmegaConf.to_yaml(OmegaConf.structured(cfg_new)))

     # OK: update value from OmegaConf Config
     config_new: TrainerConfig = deepcopy(config)
-    config_new.augmentation.color_jitter.hue = 0.5
+    cfg_new.augmentation.transforms[-1].hue = 0.5
     # print(OmegaConf.to_yaml(config_new))
diff --git a/train.py b/train.py
index e2b937d0..7c286a26 100644
--- a/train.py
+++ b/train.py
@@ -1,7 +1,7 @@
 from netspresso_trainer import train_cli

 def train_with_inline_cfg():
-    from netspresso_trainer import TrainerConfig, train, export_config_as_yaml
+    from netspresso_trainer import TrainerConfig, train_with_config, export_config_as_yaml
     from netspresso_trainer.cfg import ClassificationResNetModelConfig, ExampleBeansDataset

     """
@@ -38,7 +38,9 @@ def train_with_inline_cfg():
     print(export_config_as_yaml(cfg))

-    train(cfg, log_level='INFO')
+    train_with_config(gpus="1",
+                      config=cfg,
+                      log_level='INFO')


 def train_with_inline_yaml():
     from netspresso_trainer import train_with_yaml
@@ -52,10 +54,10 @@ def train_with_inline_yaml():

 if __name__ == '__main__':
-    train_cli()
+    # train_cli()

     # With inline yaml
     # train_with_inline_yaml()

     # With inline pythonic config
-    # train_with_inline_cfg()
\ No newline at end of file
+    train_with_inline_cfg()
\ No newline at end of file
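
For reference, the call pattern exercised by the patched `train.py` reduces to the sketch below. The `TrainerConfig` field names (`data`, `model`) are assumed from the surrounding `tools/config_test.py` hunk rather than a documented API, and `train_with_config` accepts only a single GPU id in this release; the GPU id `"0"` is an arbitrary example.

```python
# Minimal sketch of the new single-GPU inline-config entry point (v0.0.10).
# Assumption: TrainerConfig exposes `data` and `model` fields as suggested by tools/config_test.py.
from netspresso_trainer import TrainerConfig, train_with_config, export_config_as_yaml
from netspresso_trainer.cfg import ClassificationResNetModelConfig, ExampleBeansDataset

cfg = TrainerConfig(
    data=ExampleBeansDataset,                 # example HuggingFace beans dataset config
    model=ClassificationResNetModelConfig(),  # ResNet classification model config
)

# Inspect the merged configuration as YAML before launching, as train.py does.
print(export_config_as_yaml(cfg))

# `gpus` is parsed by `parse_gpu_ids`; a single GPU id string such as "0" is required here.
train_with_config(gpus="0", config=cfg, log_level='INFO')
```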