In [2]:
import torch
from torchvision.models.segmentation.deeplabv3 import DeepLabHead
from peft import LoraConfig, get_peft_model

In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()``, adding up ``numel()`` for
    every parameter and, separately, for those whose ``requires_grad`` is
    set, then prints both totals plus the trainable share as a percentage
    with two decimals.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and ``requires_grad``.

    Returns:
        None. The summary is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise divide by zero; report 0%.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [36]:
# backbone: resnet
from torchvision import models

# `pretrained=True` is deprecated in torchvision's multi-weight API; the
# explicit enum below selects the same ImageNet-1k (V1) checkpoint.
res_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# Baseline parameter count before any adapter is attached.
print_trainable_parameters(res_model)
# Dotted module names are what LoRA's `target_modules` is matched against.
for name, module in res_model.named_modules():
    print(name)
print(res_model)

trainable params: 25557032 || all params: 25557032 || trainable%: 100.00

conv1
bn1
relu
maxpool
layer1
layer1.0
layer1.0.conv1
layer1.0.bn1
layer1.0.conv2
layer1.0.bn2
layer1.0.conv3
layer1.0.bn3
layer1.0.relu
layer1.0.downsample
layer1.0.downsample.0
layer1.0.downsample.1
layer1.1
layer1.1.conv1
layer1.1.bn1
layer1.1.conv2
layer1.1.bn2
layer1.1.conv3
layer1.1.bn3
layer1.1.relu
layer1.2
layer1.2.conv1
layer1.2.bn1
layer1.2.conv2
layer1.2.bn2
layer1.2.conv3
layer1.2.bn3
layer1.2.relu
layer2
layer2.0
layer2.0.conv1
layer2.0.bn1
layer2.0.conv2
layer2.0.bn2
layer2.0.conv3
layer2.0.bn3
layer2.0.relu
layer2.0.downsample
layer2.0.downsample.0
layer2.0.downsample.1
layer2.1
layer2.1.conv1
layer2.1.bn1
layer2.1.conv2
layer2.1.bn2
layer2.1.conv3
layer2.1.bn3
layer2.1.relu
layer2.2
layer2.2.conv1
layer2.2.bn1
layer2.2.conv2
layer2.2.bn2
layer2.2.conv3
layer2.2.bn3
layer2.2.relu
layer2.3
layer2.3.conv1
layer2.3.bn1
layer2.3.conv2
layer2.3.bn2
layer2.3.conv3
layer2.3.bn3
layer2.3.relu
layer3
layer

In [37]:
# backbone: resnet + LoRA
# Attach rank-16 LoRA adapters to every module whose name ends in `conv3`
# and keep the `fc` module trainable alongside the adapters.
config = LoraConfig(
    target_modules=["conv3"],
    modules_to_save=["fc"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the ResNet-50 created earlier and report how much of it is trainable.
lora_model = get_peft_model(res_model, config)
print_trainable_parameters(lora_model)

trainable params: 2351080 || all params: 27908112 || trainable%: 8.42


In [49]:
# segmentation_model 1.0: res50 + DeepLabV3
class CustomDeepLabV3(torch.nn.Module):
    """Segmentation model that chains a feature backbone with a classifier head.

    The head's output is resized to the input's spatial size and returned in
    a dict under the key ``'out'``.

    Args:
        backbone: Module applied to the input first.
        classifier: Module applied to the backbone's output.
    """

    def __init__(self, backbone, classifier):
        super().__init__()
        self.backbone = backbone
        self.classifier = classifier

    def forward(self, x):
        """Run backbone then head, and bilinearly resize to ``x``'s last two dims."""
        target_size = x.shape[-2:]
        logits = self.classifier(self.backbone(x))
        # The backbone's downsampling leaves the head output spatially smaller
        # than the input image, so interpolate it back to the input size.
        resized = torch.nn.functional.interpolate(
            logits, size=target_size, mode="bilinear", align_corners=False
        )
        return {"out": resized}

# Feature extractor: ImageNet-pretrained ResNet-50 with its last two children
# removed. `weights=` replaces the deprecated `pretrained=True`;
# IMAGENET1K_V1 is the checkpoint that flag used to load.
backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
backbone = torch.nn.Sequential(*list(backbone.children())[:-2])
# Freeze the backbone so only the segmentation head is trainable.
backbone.requires_grad_(False)

num_classes = 20
# the segmentation head is responsible for making the final pixel-wise predictions;
# 2048 is the number of output channels in the resnet50 backbone
segmentation_head = DeepLabHead(2048, num_classes)

# Combine the frozen backbone and the trainable head into one model.
model = CustomDeepLabV3(backbone, segmentation_head)

print_trainable_parameters(model)
for name, module in model.named_modules():
    print(name)
print(model)

trainable params: 16130580 || all params: 39638612 || trainable%: 40.69

backbone
backbone.0
backbone.1
backbone.2
backbone.3
backbone.4
backbone.4.0
backbone.4.0.conv1
backbone.4.0.bn1
backbone.4.0.conv2
backbone.4.0.bn2
backbone.4.0.conv3
backbone.4.0.bn3
backbone.4.0.relu
backbone.4.0.downsample
backbone.4.0.downsample.0
backbone.4.0.downsample.1
backbone.4.1
backbone.4.1.conv1
backbone.4.1.bn1
backbone.4.1.conv2
backbone.4.1.bn2
backbone.4.1.conv3
backbone.4.1.bn3
backbone.4.1.relu
backbone.4.2
backbone.4.2.conv1
backbone.4.2.bn1
backbone.4.2.conv2
backbone.4.2.bn2
backbone.4.2.conv3
backbone.4.2.bn3
backbone.4.2.relu
backbone.5
backbone.5.0
backbone.5.0.conv1
backbone.5.0.bn1
backbone.5.0.conv2
backbone.5.0.bn2
backbone.5.0.conv3
backbone.5.0.bn3
backbone.5.0.relu
backbone.5.0.downsample
backbone.5.0.downsample.0
backbone.5.0.downsample.1
backbone.5.1
backbone.5.1.conv1
backbone.5.1.bn1
backbone.5.1.conv2
backbone.5.1.bn2
backbone.5.1.conv3
backbone.5.1.bn3
backbone.5.1.relu
backb

In [51]:
# segmentation_model 1.1: res50 + LoRA(conv2) + DeepLabV3
class CustomDeepLabV3(torch.nn.Module):
    """Segmentation model that chains a feature backbone with a classifier head.

    The head's output is resized to the input's spatial size and returned in
    a dict under the key ``'out'``.

    Args:
        backbone: Module applied to the input first.
        classifier: Module applied to the backbone's output.
    """

    def __init__(self, backbone, classifier):
        super().__init__()
        self.backbone = backbone
        self.classifier = classifier

    def forward(self, x):
        """Run backbone then head, and bilinearly resize to ``x``'s last two dims."""
        target_size = x.shape[-2:]
        logits = self.classifier(self.backbone(x))
        # The backbone's downsampling leaves the head output spatially smaller
        # than the input image, so interpolate it back to the input size.
        resized = torch.nn.functional.interpolate(
            logits, size=target_size, mode="bilinear", align_corners=False
        )
        return {"out": resized}

# Feature extractor: ImageNet-pretrained ResNet-50 with its last two children
# removed. `weights=` replaces the deprecated `pretrained=True`;
# IMAGENET1K_V1 is the checkpoint that flag used to load.
backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
backbone = torch.nn.Sequential(*list(backbone.children())[:-2])

num_classes = 20
# the segmentation head is responsible for making the final pixel-wise predictions;
# 2048 is the number of output channels in the resnet50 backbone
segmentation_head = DeepLabHead(2048, num_classes)

# LoRA adapters on every module whose name ends in `conv2`.
# No `modules_to_save`: the `fc` layer it used to name was stripped from the
# backbone above, so that entry matched nothing.
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["conv2"],
    lora_dropout=0.1,
    bias="none",
)
backbone = get_peft_model(backbone, config)

# Combine the LoRA-wrapped backbone and the segmentation head into one model.
model = CustomDeepLabV3(backbone, segmentation_head)

print_trainable_parameters(model)
for name, module in model.named_modules():
    print(name)
print(model)

trainable params: 16734740 || all params: 40242772 || trainable%: 41.58

backbone
backbone.base_model
backbone.base_model.model
backbone.base_model.model.0
backbone.base_model.model.1
backbone.base_model.model.2
backbone.base_model.model.3
backbone.base_model.model.4
backbone.base_model.model.4.0
backbone.base_model.model.4.0.conv1
backbone.base_model.model.4.0.bn1
backbone.base_model.model.4.0.conv2
backbone.base_model.model.4.0.conv2.base_layer
backbone.base_model.model.4.0.conv2.lora_dropout
backbone.base_model.model.4.0.conv2.lora_dropout.default
backbone.base_model.model.4.0.conv2.lora_A
backbone.base_model.model.4.0.conv2.lora_A.default
backbone.base_model.model.4.0.conv2.lora_B
backbone.base_model.model.4.0.conv2.lora_B.default
backbone.base_model.model.4.0.conv2.lora_embedding_A
backbone.base_model.model.4.0.conv2.lora_embedding_B
backbone.base_model.model.4.0.bn2
backbone.base_model.model.4.0.conv3
backbone.base_model.model.4.0.bn3
backbone.base_model.model.4.0.relu
backbone.b

In [52]:
# segmentation_model 1.2: res50 + LoRA(conv3) + DeepLabV3
class CustomDeepLabV3(torch.nn.Module):
    """Segmentation model that chains a feature backbone with a classifier head.

    The head's output is resized to the input's spatial size and returned in
    a dict under the key ``'out'``.

    Args:
        backbone: Module applied to the input first.
        classifier: Module applied to the backbone's output.
    """

    def __init__(self, backbone, classifier):
        super().__init__()
        self.backbone = backbone
        self.classifier = classifier

    def forward(self, x):
        """Run backbone then head, and bilinearly resize to ``x``'s last two dims."""
        target_size = x.shape[-2:]
        logits = self.classifier(self.backbone(x))
        # The backbone's downsampling leaves the head output spatially smaller
        # than the input image, so interpolate it back to the input size.
        resized = torch.nn.functional.interpolate(
            logits, size=target_size, mode="bilinear", align_corners=False
        )
        return {"out": resized}

# Feature extractor: ImageNet-pretrained ResNet-50 with its last two children
# removed. `weights=` replaces the deprecated `pretrained=True`;
# IMAGENET1K_V1 is the checkpoint that flag used to load.
backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
backbone = torch.nn.Sequential(*list(backbone.children())[:-2])

num_classes = 20
# the segmentation head is responsible for making the final pixel-wise predictions;
# 2048 is the number of output channels in the resnet50 backbone
segmentation_head = DeepLabHead(2048, num_classes)

# LoRA adapters on every module whose name ends in `conv3`.
# No `modules_to_save`: the `fc` layer it used to name was stripped from the
# backbone above, so that entry matched nothing.
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["conv3"],
    lora_dropout=0.1,
    bias="none",
)
backbone = get_peft_model(backbone, config)

# Combine the LoRA-wrapped backbone and the segmentation head into one model.
model = CustomDeepLabV3(backbone, segmentation_head)

print_trainable_parameters(model)
for name, module in model.named_modules():
    print(name)
print(model)

trainable params: 16432660 || all params: 39940692 || trainable%: 41.14

backbone
backbone.base_model
backbone.base_model.model
backbone.base_model.model.0
backbone.base_model.model.1
backbone.base_model.model.2
backbone.base_model.model.3
backbone.base_model.model.4
backbone.base_model.model.4.0
backbone.base_model.model.4.0.conv1
backbone.base_model.model.4.0.bn1
backbone.base_model.model.4.0.conv2
backbone.base_model.model.4.0.bn2
backbone.base_model.model.4.0.conv3
backbone.base_model.model.4.0.conv3.base_layer
backbone.base_model.model.4.0.conv3.lora_dropout
backbone.base_model.model.4.0.conv3.lora_dropout.default
backbone.base_model.model.4.0.conv3.lora_A
backbone.base_model.model.4.0.conv3.lora_A.default
backbone.base_model.model.4.0.conv3.lora_B
backbone.base_model.model.4.0.conv3.lora_B.default
backbone.base_model.model.4.0.conv3.lora_embedding_A
backbone.base_model.model.4.0.conv3.lora_embedding_B
backbone.base_model.model.4.0.bn3
backbone.base_model.model.4.0.relu
backbone.b

In [70]:
# segmentation_model 1.3: res50 + LoRA(conv2, conv3) + DeepLabV3
class CustomDeepLabV3(torch.nn.Module):
    """Segmentation model that chains a feature backbone with a classifier head.

    The head's output is resized to the input's spatial size and returned in
    a dict under the key ``'out'``.

    Args:
        backbone: Module applied to the input first.
        classifier: Module applied to the backbone's output.
    """

    def __init__(self, backbone, classifier):
        super().__init__()
        self.backbone = backbone
        self.classifier = classifier

    def forward(self, x):
        """Run backbone then head, and bilinearly resize to ``x``'s last two dims."""
        target_size = x.shape[-2:]
        logits = self.classifier(self.backbone(x))
        # The backbone's downsampling leaves the head output spatially smaller
        # than the input image, so interpolate it back to the input size.
        resized = torch.nn.functional.interpolate(
            logits, size=target_size, mode="bilinear", align_corners=False
        )
        return {"out": resized}

# Feature extractor: ImageNet-pretrained ResNet-50 with its last two children
# removed. `weights=` replaces the deprecated `pretrained=True`;
# IMAGENET1K_V1 is the checkpoint that flag used to load.
backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
backbone = torch.nn.Sequential(*list(backbone.children())[:-2])

num_classes = 20
# the segmentation head is responsible for making the final pixel-wise predictions;
# 2048 is the number of output channels in the resnet50 backbone
segmentation_head = DeepLabHead(2048, num_classes)

# LoRA adapters on every module whose name ends in `conv2` or `conv3`.
# No `modules_to_save`: the `fc` layer it used to name was stripped from the
# backbone above, so that entry matched nothing.
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["conv2", "conv3"],
    lora_dropout=0.1,
    bias="none",
)
backbone = get_peft_model(backbone, config)

# Combine the LoRA-wrapped backbone and the segmentation head into one model.
model = CustomDeepLabV3(backbone, segmentation_head)

print_trainable_parameters(model)
for name, module in model.named_modules():
    print(name)
print(model)



trainable params: 17036820 || all params: 40544852 || trainable%: 42.02

backbone
backbone.base_model
backbone.base_model.model
backbone.base_model.model.0
backbone.base_model.model.1
backbone.base_model.model.2
backbone.base_model.model.3
backbone.base_model.model.4
backbone.base_model.model.4.0
backbone.base_model.model.4.0.conv1
backbone.base_model.model.4.0.bn1
backbone.base_model.model.4.0.conv2
backbone.base_model.model.4.0.conv2.base_layer
backbone.base_model.model.4.0.conv2.lora_dropout
backbone.base_model.model.4.0.conv2.lora_dropout.default
backbone.base_model.model.4.0.conv2.lora_A
backbone.base_model.model.4.0.conv2.lora_A.default
backbone.base_model.model.4.0.conv2.lora_B
backbone.base_model.model.4.0.conv2.lora_B.default
backbone.base_model.model.4.0.conv2.lora_embedding_A
backbone.base_model.model.4.0.conv2.lora_embedding_B
backbone.base_model.model.4.0.bn2
backbone.base_model.model.4.0.conv3
backbone.base_model.model.4.0.conv3.base_layer
backbone.base_model.model.4.0.co

In [39]:
# backbone: viT
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

# pre-trained model from which to fine-tune (pretrained on ImageNet-21k)
model_checkpoint = "google/vit-base-patch16-224-in21k"

# ignore_mismatched_sizes: provide this in case you're planning to fine-tune
# an already fine-tuned checkpoint
model = AutoModelForImageClassification.from_pretrained(model_checkpoint, ignore_mismatched_sizes=True)

# Report the parameter budget, then list module names and the full layout.
print_trainable_parameters(model)
for name, module in model.named_modules():
    print(name)
print(model)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 85800194 || all params: 85800194 || trainable%: 100.00

vit
vit.embeddings
vit.embeddings.patch_embeddings
vit.embeddings.patch_embeddings.projection
vit.embeddings.dropout
vit.encoder
vit.encoder.layer
vit.encoder.layer.0
vit.encoder.layer.0.attention
vit.encoder.layer.0.attention.attention
vit.encoder.layer.0.attention.attention.query
vit.encoder.layer.0.attention.attention.key
vit.encoder.layer.0.attention.attention.value
vit.encoder.layer.0.attention.attention.dropout
vit.encoder.layer.0.attention.output
vit.encoder.layer.0.attention.output.dense
vit.encoder.layer.0.attention.output.dropout
vit.encoder.layer.0.intermediate
vit.encoder.layer.0.intermediate.dense
vit.encoder.layer.0.intermediate.intermediate_act_fn
vit.encoder.layer.0.output
vit.encoder.layer.0.output.dense
vit.encoder.layer.0.output.dropout
vit.encoder.layer.0.layernorm_before
vit.encoder.layer.0.layernorm_after
vit.encoder.layer.1
vit.encoder.layer.1.attention
vit.encoder.layer.1.attention.attenti

In [9]:
# segformer 1.0: segformer
from transformers import SegformerForSemanticSegmentation

num_classes = 20

model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b1-finetuned-cityscapes-1024-1024")

# Freeze every parameter: this variant is the fully frozen baseline.
model.requires_grad_(False)

print_trainable_parameters(model)

trainable params: 0 || all params: 13682131 || trainable%: 0.00


In [10]:
# segformer 1.1: segformer + linear probe(20 classes)
# NOTE(review): this cell loads the ADE checkpoint, exactly like the
# "segformer 2.0" cell, while the other 1.x cells load the Cityscapes
# checkpoint - confirm which one was intended here.
num_classes = 20

model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b1-finetuned-ade-512-512")
# Freeze the whole pretrained network ...
model.requires_grad_(False)

# ... then swap in a fresh 1x1-conv classifier, which is trainable by default.
# Its input width is read from the head being replaced instead of hard-coding
# 256, so the cell keeps working if the checkpoint is changed.
model.decode_head.classifier = torch.nn.Conv2d(
    model.decode_head.classifier.in_channels, num_classes, kernel_size=1
)
print_trainable_parameters(model)


trainable params: 5140 || all params: 13682388 || trainable%: 0.04


In [11]:
# segformer 1.2: segformer + LoRA(attention,MLP, r = 64) + linear probe
from transformers import SegformerForSemanticSegmentation

model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b1-finetuned-cityscapes-1024-1024")

num_classes = 20

# LoRA adapters on modules whose names end in `dense` / `dense2`.
# `modules_to_save=["fc"]` was removed: the recorded parameter counts show it
# matched no module in this model.
config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["dense", "dense2"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, config)

# Replace the classifier after wrapping so the new layer keeps its default
# trainable parameters. Its input width is read from the head being replaced
# instead of hard-coding 256.
# NOTE(review): the new head is not listed in `modules_to_save`, so it is
# presumably not stored with the adapter weights - confirm if adapters are saved.
model.decode_head.classifier = torch.nn.Conv2d(
    model.decode_head.classifier.in_channels, num_classes, kernel_size=1
)

print_trainable_parameters(model)

trainable params: 922644 || all params: 14599892 || trainable%: 6.32


In [12]:
# segformer 1.3: segformer + LoRA(attention, r = 512) + linear probe
from transformers import SegformerForSemanticSegmentation

model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b1-finetuned-cityscapes-1024-1024")

num_classes = 20

# LoRA adapters on modules whose names end in `dense`.
# `modules_to_save=["fc"]` was removed: it presumably matched no module in
# this model, as in the other SegFormer cells.
# NOTE(review): r=512 is presumably no smaller than the widths of the targeted
# layers, so these adapters are not actually low-rank - confirm intended.
config = LoraConfig(
    r=512,
    lora_alpha=16,
    target_modules=["dense"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, config)

# Replace the classifier after wrapping so the new layer keeps its default
# trainable parameters. Its input width is read from the head being replaced
# instead of hard-coding 256.
model.decode_head.classifier = torch.nn.Conv2d(
    model.decode_head.classifier.in_channels, num_classes, kernel_size=1
)

print_trainable_parameters(model)

trainable params: 2097152 || all params: 15779283 || trainable%: 13.29


In [13]:
# segformer 2.0: segformer + linear probe
num_classes = 20

model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b1-finetuned-ade-512-512")
# Freeze the whole pretrained network ...
model.requires_grad_(False)

# ... then swap in a fresh 1x1-conv classifier, which is trainable by default.
# Its input width is read from the head being replaced instead of hard-coding
# 256, so the cell keeps working if the checkpoint is changed.
model.decode_head.classifier = torch.nn.Conv2d(
    model.decode_head.classifier.in_channels, num_classes, kernel_size=1
)
print_trainable_parameters(model)

trainable params: 5140 || all params: 13682388 || trainable%: 0.04


In [14]:
# segformer 2.1: segformer + LoRA(attention,MLP, r = 64) + linear probe
num_classes = 20

model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b1-finetuned-ade-512-512")

# LoRA adapters on modules whose names end in `dense` / `dense2`.
# `modules_to_save=["fc"]` was removed: the recorded parameter counts show it
# matched no module in this model.
config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["dense", "dense2"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, config)

# Replace the classifier after wrapping so the new layer keeps its default
# trainable parameters. Its input width is read from the head being replaced
# instead of hard-coding 256.
model.decode_head.classifier = torch.nn.Conv2d(
    model.decode_head.classifier.in_channels, num_classes, kernel_size=1
)
print_trainable_parameters(model)
    

trainable params: 922644 || all params: 14599892 || trainable%: 6.32


In [18]:
# segformer 2.2: segformer + LoRA(attention, r = 512) + linear probe
num_classes = 20

model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b4-finetuned-ade-512-512")

# LoRA adapters on modules whose names end in `dense`.
# `modules_to_save=["fc"]` was removed: it presumably matched no module in
# this model, as in the other SegFormer cells.
config = LoraConfig(
    r=512,
    lora_alpha=16,
    target_modules=["dense"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, config)

# Replace the classifier after wrapping so the new layer keeps its default
# trainable parameters. The input width must come from the head being
# replaced: the 256 used in the b1 cells is not guaranteed to match this
# larger checkpoint.
# NOTE(review): b4's decode head is expected to be 768 wide - confirm via
# model.config.decoder_hidden_size.
model.decode_head.classifier = torch.nn.Conv2d(
    model.decode_head.classifier.in_channels, num_classes, kernel_size=1
)

print_trainable_parameters(model)


OSError: nvidia/segformer-b6-finetuned-ade-512-512 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`